"""Crawl a serialized web story chapter by chapter and save it as Markdown.

Starting from a fixed first-chapter URL, the script follows each page's
``rel="next"`` link, converts the title and body to Markdown, and writes one
file per chapter under ``<output_dir>/md`` plus an ``index.txt`` that maps
file names to chapter titles.
"""

import logging
import os

import bs4
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

output_dir = "./TTOU_2023"

logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO)


def parse_apgte(url, timeout=30):
    """Fetch one chapter page and extract its title, body and next link.

    Args:
        url: Address of the chapter page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        A tuple ``(next_url, header, article)`` where ``next_url`` is the
        ``href`` of the first element with ``rel="next"`` (``None`` when the
        page has none) and ``header`` / ``article`` are the BeautifulSoup
        elements with class ``entry-title`` / ``entry-content``.

    Raises:
        requests.RequestException: On network errors, timeouts or a non-2xx
            HTTP status.
        ValueError: If the page has no ``entry-title`` or ``entry-content``
            element, i.e. it is not shaped like a chapter page.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    header = soup.find(attrs={"class": "entry-title"})
    article = soup.find(attrs={"class": "entry-content"})
    if header is None or article is None:
        raise ValueError(f"No entry-title/entry-content element found at {url}")

    # Drop the first <div> inside the article; the original author marked it
    # as the "sharedaddy" (share buttons) widget. Skip when there is no div.
    if article.div is not None:
        article.div.decompose()

    next_link = soup.find(attrs={"rel": "next"})
    next_url = next_link.get("href") if next_link is not None else None
    return (next_url, header, article)


def chop_between_finds(s, f1, f2):
    """Return the lines of *s* strictly between two marker lines.

    The start marker is the first line containing *f1*; the end marker is the
    last line containing *f2*. Both marker lines are excluded from the result.
    If *f1* never occurs the result starts at the first line; if *f2* never
    occurs it runs through the last line.

    NOTE: when the start and end marker resolve to the same line (for example
    ``f1 == f2`` with a single occurrence) the result is an empty string.

    Args:
        s: Text to trim.
        f1: Substring identifying the start marker line.
        f2: Substring identifying the end marker line.

    Returns:
        The selected lines joined with ``"\\n"``.
    """
    lines = s.splitlines()
    first = next((i for i, line in enumerate(lines) if f1 in line), None)
    last = next(
        (i for i in range(len(lines) - 1, -1, -1) if f2 in lines[i]),
        None,
    )
    start = 0 if first is None else first + 1
    stop = len(lines) if last is None else last
    return "\n".join(lines[start:stop])


def main():
    """Crawl every chapter reachable from the first URL and write it to disk."""
    next_url = (
        "https://derinstories.com/2022/06/04/001-the-problem-with-the-javelin-program/"
    )
    chapter = 0
    index = []

    # Creates output_dir as well; done once rather than on every iteration.
    os.makedirs(f"{output_dir}/md", exist_ok=True)

    # NOTE: all chapters go into a single md/ directory. Splitting into
    # per-book directories was sketched earlier but is not implemented.
    while next_url:
        logging.info("parsing: %s", next_url)
        try:
            next_url, title, text = parse_apgte(next_url)
        except Exception:
            logging.exception("Failed to parse: %s", next_url)
            # next_url is unchanged and title/text are stale or unbound, so
            # continuing would loop forever or crash. Stop and keep what we
            # have; the index below is still written.
            break

        filename = f"{output_dir}/md/Ch-{chapter:03d}.md"
        # .string is None when the header wraps several child nodes.
        title_text = title.string or title.get_text()
        index.append(f"Ch-{chapter:03d}.md:\t{title_text}")

        txt_md = markdownify(str(text))
        # Keep only the text between the first and last line mentioning an
        # image file. Presumably header/footer images; confirm on the site.
        txt_md = chop_between_finds(txt_md, ".png", ".png")

        # Explicit UTF-8: the platform default encoding may not be able to
        # represent every character in the chapter text.
        with open(filename, "w", encoding="utf-8") as mdfile:
            mdfile.write(markdownify(str(title)))
            mdfile.write(txt_md)

        chapter += 1

    with open(f"{output_dir}/index.txt", "w", encoding="utf-8") as txtfile:
        txtfile.write("\n".join(index))
    logging.info("Done!")


if __name__ == "__main__":
    main()