88 lines
2.7 KiB
Python
Executable file
88 lines
2.7 KiB
Python
Executable file
import os
|
|
import bs4
|
|
import logging
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from markdownify import markdownify
|
|
|
|
# Root folder for generated output: the script writes the per-chapter Markdown
# files under "<output_dir>/md" and the chapter listing to "<output_dir>/index.txt".
output_dir = "./TTOU_2023"
|
|
|
|
# Configure the root logger: each record shows timestamp, level name and
# message, and records below INFO are suppressed.
logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)
|
|
|
|
|
|
def parse_apgte(url):
    """Download one chapter page and extract its title, body and next link.

    Args:
        url: Address of the chapter page to fetch.

    Returns:
        A ``(next_url, header, article)`` tuple where ``next_url`` is the
        ``href`` of the element marked ``rel="next"`` (``None`` when the page
        has no such element), ``header`` is the element with class
        ``entry-title`` and ``article`` is the element with class
        ``entry-content`` with its first ``<div>`` removed.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the page has no ``entry-title`` or ``entry-content``
            element.
    """
    # A timeout keeps a stalled server from hanging the whole crawl, and
    # raise_for_status stops error pages from being parsed as chapters.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    header = soup.find(attrs={"class": "entry-title"})
    article = soup.find(attrs={"class": "entry-content"})
    if header is None or article is None:
        raise ValueError(f"No entry-title/entry-content element found at {url}")

    # The first <div> inside the content is dropped; the original author's
    # note identifies it as the "sharedaddy" block. Skip quietly when absent.
    first_div = article.div
    if first_div is not None:
        first_div.decompose()

    next_link = soup.find(attrs={"rel": "next"})
    next_url = next_link.get("href") if next_link is not None else None
    return (next_url, header, article)
|
|
|
|
|
|
def chop_between_finds(s, f1, f2):
    """Return the lines of *s* lying strictly between two marker lines.

    The start marker is the first line containing ``f1`` and the end marker
    is the last line containing ``f2``; neither marker line is included in
    the result.

    Args:
        s: Text to trim.
        f1: Substring identifying the start marker line.
        f2: Substring identifying the end marker line (may equal ``f1``).

    Returns:
        The retained lines joined with ``"\\n"``. When ``f1`` is not found the
        result starts at the first line of ``s``; when ``f2`` is not found it
        runs through the last line. If the end marker does not come after the
        start marker (including the case of a single shared marker line) the
        result is an empty string.
    """
    lines = s.splitlines()

    f1_occ = [idx for idx, line in enumerate(lines) if f1 in line]
    # Reuse the first scan when both markers are the same substring.
    f2_occ = f1_occ if f1 == f2 else [idx for idx, line in enumerate(lines) if f2 in line]

    # A missing marker must not cost a line: previously an absent f1 still
    # skipped line 0 and an absent f2 still dropped the final line.
    start = f1_occ[0] + 1 if f1_occ else 0
    end = f2_occ[-1] if f2_occ else len(lines)

    return "\n".join(lines[start:end])
|
|
|
|
|
|
if __name__ == "__main__":
    # Crawl starts at this chapter; every parsed page supplies the URL of the
    # next one, and the loop ends when a page has no "next" link.
    next_url = (
        "https://derinstories.com/2022/06/04/001-the-problem-with-the-javelin-program/"
    )
    chapter = 0

    # One "<file name>:\t<title>" entry per chapter written.
    index = []

    # Create the output tree once up front instead of re-checking per chapter.
    os.makedirs(f"{output_dir}/md", exist_ok=True)

    while next_url:
        logging.info("parsing: %s", next_url)

        try:
            following_url, title, text = parse_apgte(next_url)
        except Exception:
            logging.exception("Failed to parse: %s", next_url)
            # next_url could not be advanced, so continuing would either hit
            # undefined title/text or retry the same page forever while
            # rewriting the previous chapter. Stop and keep what we have.
            break
        next_url = following_url

        filename = f"{output_dir}/md/Ch-{chapter:03d}.md"
        index.append(f"Ch-{chapter:03d}.md:\t{title.string}")

        # Explicit UTF-8 so non-ASCII text in the chapters does not depend on
        # the platform's default encoding.
        with open(filename, "w", encoding="utf-8") as mdfile:
            mdfile.write(markdownify(str(title)))

            txt_md = markdownify(str(text))
            # Keep only the text between the first and last lines that
            # mention a ".png" image.
            txt_md = chop_between_finds(txt_md, ".png", ".png")
            mdfile.write(txt_md)

        chapter += 1

    with open(f"{output_dir}/index.txt", "w", encoding="utf-8") as txtfile:
        txtfile.write("\n".join(index))
    logging.info("Done!")
|