# webcrawl/ttou_download.py
import os
import logging

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

output_dir = "./TTOU_2023"

logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)


def parse_apgte(url):
    """Fetch one chapter page; return (next-chapter URL or None, title tag, body tag).

    The name looks like a leftover from an earlier APGTE scraper; the
    entry-title/entry-content classes are standard WordPress markup.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    header = soup.find(attrs={"class": "entry-title"})
    article = soup.find(attrs={"class": "entry-content"})
    article.div.decompose()  # delete the sharedaddy social-sharing block
    next_link = soup.find(attrs={"rel": "next"})
    return (next_link.get("href") if next_link else None, header, article)
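
# A minimal hardening sketch (not wired into parse_apgte above, shown for
# reference): requests.get() without a timeout can hang forever on a dead
# link, and a non-200 error page would otherwise be parsed as if it were a
# chapter.
#
#   resp = requests.get(url, timeout=30)
#   resp.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
#   html = resp.text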


def chop_between_finds(s, f1, f2):
    """Return the lines of s strictly between the first line containing f1
    and the last line containing f2 (here: decorative header/footer images)."""
    lines = s.splitlines()
    f1_occ = [idx for idx, line in enumerate(lines) if f1 in line]
    f2_occ = [idx for idx, line in enumerate(lines) if f2 in line] if f1 != f2 else f1_occ
    # Missing markers fall back to the full text; the original fallbacks
    # (0 and -1) silently dropped the first and last lines.
    idx0 = f1_occ[0] if f1_occ else -1
    idx1 = f2_occ[-1] if f2_occ else len(lines)
    return "\n".join(lines[idx0 + 1: idx1])
if __name__ == "__main__":
next_url = (
"https://derinstories.com/2022/06/04/001-the-problem-with-the-javelin-program/"
)
count = 0
book = 1
chapter = 0
index = []
if not os.path.exists(output_dir):
os.mkdir(output_dir)
# while next_url and count < 30:
while next_url:
logging.info(f"parsing: {next_url}")
try:
next_url, title, text = parse_apgte(next_url)
except Exception:
logging.exception(f"Failed to parse: {next_url}")
        # if not os.path.exists(f'{output_dir}/Book-{book}'):
        #     os.mkdir(f'{output_dir}/Book-{book}')
        # if not os.path.exists(f'{output_dir}/Book-{book}/md'):
        #     os.mkdir(f'{output_dir}/Book-{book}/md')
        if not os.path.exists(f"{output_dir}/md"):
            os.mkdir(f"{output_dir}/md")
        # filename = f'{output_dir}/Book-{book}/md/Ch-{chapter}.md'
        filename = f"{output_dir}/md/Ch-{chapter:03d}.md"
        index.append(f"Ch-{chapter:03d}.md:\t{title.string}")
        with open(filename, "w") as mdfile:
            mdfile.write(markdownify(str(title)))
            # mdfile.write('\n\n')
            txt_md = markdownify(str(text))
            # Keep only the text between the header and footer images.
            txt_md = chop_between_finds(txt_md, ".png", ".png")
            mdfile.write(txt_md)
        # if next_url and 'prologue' in next_url:
        #     with open(f'{output_dir}/Book-{book}/index.txt', 'w') as txtfile:
        #         txtfile.write("\n".join(index))
        #     index = []
        #     book += 1
        #     chapter = 0
        # else:
        chapter += 1
        count += 1
with open(f"{output_dir}/index.txt", "w") as txtfile:
txtfile.write("\n".join(index))
logging.info("Done!")