76 lines
1.9 KiB
Python
76 lines
1.9 KiB
Python
import os
|
|
import bs4
|
|
import logging
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from markdownify import markdownify
|
|
|
|
|
|
|
|
|
|
# Root logger setup: INFO and above, each record prefixed with a timestamp
# and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')
|
|
|
|
|
|
def parse_apgte(url, timeout=30):
    """Download one chapter page and extract its title, body and next link.

    Args:
        url: Address of the chapter page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        A ``(next_url, header, article)`` tuple. ``next_url`` is the ``href``
        of the element marked ``rel="next"``, or ``None`` when the page has
        no such element. ``header`` is the element with class
        ``entry-title`` (``None`` if the page has none). ``article`` is the
        element with class ``entry-content``.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an HTTP error status.
        ValueError: The page has no ``entry-content`` element.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a chapter.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    header = soup.find(attrs={'class': 'entry-title'})
    article = soup.find(attrs={'class': 'entry-content'})
    if article is None:
        raise ValueError(f'No entry-content element found at {url}')

    # Drop the first <div> inside the content; the original author's note
    # identifies it as the "sharedaddy" sharing widget. A page without any
    # <div> is left as it is rather than treated as an error.
    if article.div is not None:
        article.div.decompose()

    next_link = soup.find(attrs={'rel': 'next'})
    next_url = next_link.get('href') if next_link else None
    return (next_url, header, article)
|
|
|
|
|
|
def _write_index(book_dir, entries):
    """Write the chapter index lines of one book to ``<book_dir>/index.txt``."""
    with open(f'{book_dir}/index.txt', 'w', encoding='utf-8') as txtfile:
        txtfile.write("\n".join(entries))


def main():
    """Scrape chapters by following "next" links and save them as Markdown.

    Starting from the first prologue, every page is written to
    ``./APGTE/Book-<book>/md/Ch-<chapter>.md``. When the link to the
    following page contains ``prologue`` the current book is considered
    finished: its ``index.txt`` is written and the book/chapter counters
    move on. The index of the book in progress is also written when the
    run ends, so a partial or final book is never left without one.

    The run stops when a page has no next link, when a page cannot be
    parsed, or after ``max_pages`` pages.
    """
    next_url = 'https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/'
    # Upper bound on pages fetched in one run (kept from the original
    # script; presumably a safety/debug cap -- raise it for a full scrape).
    max_pages = 30
    count = 0
    book = 1
    chapter = 0
    index = []

    os.makedirs('APGTE', exist_ok=True)

    while next_url and count < max_pages:
        logging.info('parsing: %s', next_url)

        try:
            next_url, title, text = parse_apgte(next_url)
        except Exception:
            logging.exception('Failed to parse: %s', next_url)
            # Without a parsed page there is neither content to write nor a
            # next link to follow; carrying on would reuse stale (or
            # unbound) data and retry the same URL forever.
            break

        book_dir = f'./APGTE/Book-{book}'
        os.makedirs(f'{book_dir}/md', exist_ok=True)

        filename = f'{book_dir}/md/Ch-{chapter}.md'
        index.append(f'Ch-{chapter}.md:\t{title.string}')

        # Explicit UTF-8 so non-ASCII text does not depend on the platform's
        # default encoding.
        with open(filename, 'w', encoding='utf-8') as mdfile:
            mdfile.write(markdownify(str(title)))
            mdfile.write('\n\n')
            mdfile.write(markdownify(str(text)))

        # next_url is None on the last page, so test it before searching it.
        if next_url and 'prologue' in next_url:
            _write_index(book_dir, index)
            index = []
            book += 1
            chapter = 0
        else:
            chapter += 1

        count += 1

    # Flush the index of the book that was still being collected when the
    # loop ended; its directory exists because a chapter was written to it.
    if index:
        _write_index(f'./APGTE/Book-{book}', index)

    logging.info('Done!')


if __name__ == '__main__':
    main()
|
|
|