# webcrawl/apgte_download.py
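"""Download chapters of 'A Practical Guide to Evil' and save them as Markdown.

Starting from the Book 1 prologue, each chapter page is fetched, its title and
body are converted from HTML to Markdown, and the result is written to
./APGTE/Book-<book>/md/Ch-<chapter>.md, with a per-book index.txt listing the
chapter titles.
"""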
import os
import logging

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

def parse_apgte(url):
    """Fetch one chapter page and return (next chapter URL or None, title tag, content tag)."""
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    header = soup.find(attrs={'class': 'entry-title'})
    article = soup.find(attrs={'class': 'entry-content'})
    article.div.decompose()  # drop the first div inside the content: the sharedaddy share widget
    next_link = soup.find(attrs={'rel': 'next'})
    return (next_link.get('href') if next_link else None, header, article)

if __name__ == '__main__':
    next_url = 'https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/'
    count = 0
    book = 1
    chapter = 0
    index = []

    if not os.path.exists('./APGTE'):
        os.mkdir('APGTE')
    while next_url and count < 30:
        logging.info(f'parsing: {next_url}')
        try:
            next_url, title, text = parse_apgte(next_url)
        except Exception:
            logging.exception(f'Failed to parse: {next_url}')
            break  # without a successful parse there is no title/text and no next URL to follow

        if not os.path.exists(f'./APGTE/Book-{book}'):
            os.mkdir(f'./APGTE/Book-{book}')
        if not os.path.exists(f'./APGTE/Book-{book}/md'):
            os.mkdir(f'./APGTE/Book-{book}/md')

        filename = f'./APGTE/Book-{book}/md/Ch-{chapter}.md'
        index.append(f'Ch-{chapter}.md:\t{title.string}')
        with open(filename, 'w', encoding='utf-8') as mdfile:
            mdfile.write(markdownify(str(title)))
            mdfile.write('\n\n')
            mdfile.write(markdownify(str(text)))
        if next_url and 'prologue' in next_url:
            # the next page is a prologue, so the current book is finished:
            # write its index, then start the next book at chapter 0
            with open(f'./APGTE/Book-{book}/index.txt', 'w', encoding='utf-8') as txtfile:
                txtfile.write("\n".join(index))
            index = []
            book += 1
            chapter = 0
        else:
            chapter += 1
        count += 1

    logging.info('Done!')
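
# Expected output layout (a sketch, based on the paths used above):
#   APGTE/
#     Book-1/
#       index.txt        # one "Ch-<n>.md:\t<chapter title>" line per chapter
#       md/
#         Ch-0.md
#         Ch-1.md
#         ...
#     Book-2/
#       ...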