"""Scrape chapters of a WordPress-hosted serial into per-book Markdown files.

Starting from START_URL, the script follows each page's ``rel="next"`` link,
converts the page title and body to Markdown and writes them to
``./APGTE/Book-<n>/md/Ch-<m>.md``.  A new book starts whenever the next URL
contains ``'prologue'``; at that point the finished book's ``index.txt`` is
written.
"""
import os
import bs4
import logging
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

START_URL = 'https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/'
MAX_PAGES = 30          # upper bound on pages fetched in one run
REQUEST_TIMEOUT = 30    # seconds; without it requests.get() may block forever
OUTPUT_ROOT = './APGTE'


def parse_apgte(url):
    """Fetch one chapter page and extract its title, body and next link.

    Args:
        url: Address of the chapter page to download.

    Returns:
        A tuple ``(next_url, header, article)`` where ``next_url`` is the
        ``href`` of the page's ``rel="next"`` element (``None`` when the page
        has none), ``header`` is the element with class ``entry-title`` and
        ``article`` is the element with class ``entry-content``.

    Raises:
        requests.RequestException: The download failed, timed out or returned
            an HTTP error status.
        ValueError: The page has no ``entry-title`` or ``entry-content``
            element.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Do not try to parse an error page as if it were a chapter.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    header = soup.find(attrs={'class': 'entry-title'})
    article = soup.find(attrs={'class': 'entry-content'})
    if header is None or article is None:
        raise ValueError(f'No chapter title/content found at {url}')
    # Drop the first <div> inside the content (per the original author's
    # note this is the "sharedaddy" block); skip when the page has none.
    if article.div is not None:
        article.div.decompose()
    next_link = soup.find(attrs={'rel': 'next'})
    return (next_link.get('href') if next_link else None, header, article)


def _write_index(book, entries):
    """Write the collected ``Ch-<m>.md:<TAB>title`` lines to the book's index.txt."""
    book_dir = f'{OUTPUT_ROOT}/Book-{book}'
    os.makedirs(book_dir, exist_ok=True)
    with open(f'{book_dir}/index.txt', 'w', encoding='utf-8') as txtfile:
        txtfile.write("\n".join(entries))


def main():
    """Crawl from START_URL and write up to MAX_PAGES chapters to disk."""
    next_url = START_URL
    count = 0
    book = 1
    chapter = 0
    index = []

    while next_url and count < MAX_PAGES:
        current_url = next_url
        logging.info('parsing: %s', current_url)
        try:
            next_url, title, text = parse_apgte(current_url)
        except Exception:
            # Without a parsed page there is neither content to write nor a
            # next link to follow, so stop instead of reusing stale data.
            logging.exception('Failed to parse: %s', current_url)
            break

        md_dir = f'{OUTPUT_ROOT}/Book-{book}/md'
        os.makedirs(md_dir, exist_ok=True)
        filename = f'{md_dir}/Ch-{chapter}.md'
        # get_text() also works when the heading contains nested tags,
        # where .string would be None.
        index.append(f'Ch-{chapter}.md:\t{title.get_text()}')
        with open(filename, 'w', encoding='utf-8') as mdfile:
            mdfile.write(markdownify(str(title)))
            mdfile.write('\n\n')
            mdfile.write(markdownify(str(text)))
        count += 1

        # A next URL containing 'prologue' means the chapter just written
        # was the last one of the current book.  next_url is None on the
        # final page, hence the truthiness guard.
        if next_url and 'prologue' in next_url:
            _write_index(book, index)
            index = []
            book += 1
            chapter = 0
        else:
            chapter += 1

    # Keep the index of a book that was still in progress when the crawl
    # stopped (page limit reached, last page, or a failed download).
    if index:
        _write_index(book, index)
    logging.info('Done!')


if __name__ == '__main__':
    main()