# webcrawl/ttou_download.py
import os
import logging

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

output_dir = "./TTOU_2023"

logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)


def parse_apgte(url):
    """Fetch one chapter page; return (next-chapter URL or None, title tag, body tag).

    The name looks like a leftover from an earlier APGTE scraper; the
    entry-title/entry-content classes are standard WordPress markup.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    header = soup.find(attrs={"class": "entry-title"})
    article = soup.find(attrs={"class": "entry-content"})
    article.div.decompose()  # delete the sharedaddy social-sharing block
    next_link = soup.find(attrs={"rel": "next"})
    return (next_link.get("href") if next_link else None, header, article)
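
# A minimal hardening sketch (not wired into parse_apgte above, shown for
# reference): requests.get() without a timeout can hang forever on a dead
# link, and a non-200 error page would otherwise be parsed as if it were a
# chapter.
#
#   resp = requests.get(url, timeout=30)
#   resp.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
#   html = resp.text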


def chop_between_finds(s, f1, f2):
    """Return the lines of s strictly between the first line containing f1
    and the last line containing f2 (here: decorative header/footer images)."""
    lines = s.splitlines()
    f1_occ = [idx for idx, line in enumerate(lines) if f1 in line]
    f2_occ = [idx for idx, line in enumerate(lines) if f2 in line] if f1 != f2 else f1_occ
    # Missing markers fall back to the full text; the original fallbacks
    # (0 and -1) silently dropped the first and last lines.
    idx0 = f1_occ[0] if f1_occ else -1
    idx1 = f2_occ[-1] if f2_occ else len(lines)
    return "\n".join(lines[idx0 + 1: idx1])
if __name__ == "__main__":
next_url = (
"https://derinstories.com/2022/06/04/001-the-problem-with-the-javelin-program/"
)
count = 0
book = 1
chapter = 0
index = []
if not os.path.exists(output_dir):
os.mkdir(output_dir)
# while next_url and count < 30:
while next_url:
logging.info(f"parsing: {next_url}")
try:
next_url, title, text = parse_apgte(next_url)
except Exception:
logging.exception(f"Failed to parse: {next_url}")
        # if not os.path.exists(f'{output_dir}/Book-{book}'):
        #     os.mkdir(f'{output_dir}/Book-{book}')
        # if not os.path.exists(f'{output_dir}/Book-{book}/md'):
        #     os.mkdir(f'{output_dir}/Book-{book}/md')
        if not os.path.exists(f"{output_dir}/md"):
            os.mkdir(f"{output_dir}/md")
        # filename = f'{output_dir}/Book-{book}/md/Ch-{chapter}.md'
        filename = f"{output_dir}/md/Ch-{chapter:03d}.md"
        index.append(f"Ch-{chapter:03d}.md:\t{title.string}")
        with open(filename, "w") as mdfile:
            mdfile.write(markdownify(str(title)))
            # mdfile.write('\n\n')
            txt_md = markdownify(str(text))
            # Keep only the text between the header and footer images.
            txt_md = chop_between_finds(txt_md, ".png", ".png")
            mdfile.write(txt_md)
        # if next_url and 'prologue' in next_url:
        #     with open(f'{output_dir}/Book-{book}/index.txt', 'w') as txtfile:
        #         txtfile.write("\n".join(index))
        #     index = []
        #     book += 1
        #     chapter = 0
        # else:
        chapter += 1
        count += 1
with open(f"{output_dir}/index.txt", "w") as txtfile:
txtfile.write("\n".join(index))
logging.info("Done!")