# webcrawl/apgte_download.py
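"""Download chapters of 'A Practical Guide to Evil' and save them as Markdown.

Starting from the Book 1 prologue, each chapter page is fetched, its title and
body are converted from HTML to Markdown, and the result is written to
./APGTE/Book-<book>/md/Ch-<chapter>.md, with a per-book index.txt listing the
chapter titles.
"""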
import os
import logging

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

def parse_apgte(url):
    """Fetch one chapter page and return (next chapter URL or None, title tag, content tag)."""
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    header = soup.find(attrs={'class': 'entry-title'})
    article = soup.find(attrs={'class': 'entry-content'})
    article.div.decompose()  # drop the first div inside the content: the sharedaddy share widget
    next_link = soup.find(attrs={'rel': 'next'})
    return (next_link.get('href') if next_link else None, header, article)

if __name__ == '__main__':
    next_url = 'https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/'
    count = 0
    book = 1
    chapter = 0
    index = []

    if not os.path.exists('./APGTE'):
        os.mkdir('APGTE')
    while next_url and count < 30:
        logging.info(f'parsing: {next_url}')
        try:
            next_url, title, text = parse_apgte(next_url)
        except Exception:
            logging.exception(f'Failed to parse: {next_url}')
            break  # without a successful parse there is no title/text and no next URL to follow

        if not os.path.exists(f'./APGTE/Book-{book}'):
            os.mkdir(f'./APGTE/Book-{book}')
        if not os.path.exists(f'./APGTE/Book-{book}/md'):
            os.mkdir(f'./APGTE/Book-{book}/md')

        filename = f'./APGTE/Book-{book}/md/Ch-{chapter}.md'
        index.append(f'Ch-{chapter}.md:\t{title.string}')
        with open(filename, 'w', encoding='utf-8') as mdfile:
            mdfile.write(markdownify(str(title)))
            mdfile.write('\n\n')
            mdfile.write(markdownify(str(text)))
        if next_url and 'prologue' in next_url:
            # the next page is a prologue, so the current book is finished:
            # write its index, then start the next book at chapter 0
            with open(f'./APGTE/Book-{book}/index.txt', 'w', encoding='utf-8') as txtfile:
                txtfile.write("\n".join(index))
            index = []
            book += 1
            chapter = 0
        else:
            chapter += 1
        count += 1

    logging.info('Done!')
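
# Expected output layout (a sketch, based on the paths used above):
#   APGTE/
#     Book-1/
#       index.txt        # one "Ch-<n>.md:\t<chapter title>" line per chapter
#       md/
#         Ch-0.md
#         Ch-1.md
#         ...
#     Book-2/
#       ...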