78 lines
2.5 KiB
Python
Executable file
78 lines
2.5 KiB
Python
Executable file
#!.venv/bin/python3
|
|
import pdfkit
|
|
import weasyprint
|
|
import re
|
|
import os
|
|
import argparse
|
|
import pickle
|
|
from htmldate import find_date
|
|
from datetime import date
|
|
|
|
|
|
# CLI arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    '--backend', '-b',
    dest='backend',
    choices=['pdfkit', 'weasyprint'],
    default='pdfkit',
    help='change the download backend; default: pdfkit',
)
# --force is a plain on/off switch.  It used to be declared with
# ``choices=[False, True]``, but argparse compares the raw command-line
# *string* against the choices, so every explicit value ("True", "1", ...)
# was rejected and a bare ``-f`` failed with "expected one argument" -- the
# option could never be enabled.  ``store_true`` gives False by default and
# True when the flag is present.
parser.add_argument(
    '--force', '-f',
    dest='force',
    action='store_true',
    default=False,
    help='force download all links instead of only the ones that need to be updated; default: False',
)
args = parser.parse_args()
|
|
|
|
|
|
# Sentinel dates that keep the "is it newer?" comparison total:
# a page whose edit date cannot be determined always looks newer than anything
# stored, and a link that was never downloaded always looks older than
# anything found.
UNKNOWN_DATE = date.fromisoformat("9999-01-01")
NEVER_DOWNLOADED = date.fromisoformat("1970-01-01")


def read_links(path):
    """Return the URLs listed in the text file *path*, one per line.

    Surrounding whitespace (including the line terminator) is stripped from
    every line; empty lines and lines starting with ``#`` are skipped.
    Stripping instead of chopping the last character keeps the final URL
    intact when the file does not end with a newline.
    """
    with open(path, "r") as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line and not line.startswith("#")]


def load_dates(path):
    """Return the ``{url: date}`` mapping pickled at *path*.

    A missing cache yields an empty mapping.  An unreadable or corrupt cache
    is reported and also treated as empty, so every link is simply
    downloaded again instead of aborting the run.
    """
    try:
        with open(path, "rb") as handle:
            # NOTE: pickle must only ever be fed trusted data.  This file is
            # local state written by save_dates(); do not point it at
            # anything downloaded or user-supplied.
            loaded = pickle.load(handle)
    except FileNotFoundError:
        return {}
    except Exception as exc:  # unpickling can raise many different errors
        print("Ignoring unreadable date cache " + path + ": " + str(exc))
        return {}
    return loaded if isinstance(loaded, dict) else {}


def save_dates(path, dates):
    """Pickle the ``{url: date}`` mapping *dates* to *path*."""
    with open(path, "wb") as handle:
        pickle.dump(dates, handle)


def pdf_name(directory, url):
    """Return the output path for *url* inside *directory*.

    The first five characters of the URL (``http:`` / ``https``) are dropped
    and every character other than word characters, ``-`` and ``.`` is
    removed from the remainder before ``.pdf`` is appended.
    """
    return os.path.join(directory, re.sub(r'(?u)[^-\w.]', '', url[5:]) + ".pdf")


def needs_download(new_date, old_date, force):
    """Return True when the link must be (re-)downloaded.

    That is the case when *force* is set or when *new_date* is strictly
    later than *old_date*.
    """
    return bool(force) or new_date > old_date


def find_edit_date(url):
    """Return the edit date htmldate reports for *url*.

    ``UNKNOWN_DATE`` is returned when ``find_date`` gives ``None``.
    """
    found = find_date(url)
    if found is None:
        return UNKNOWN_DATE
    return date.fromisoformat(found)


def render_pdf(url, target, backend):
    """Render *url* to the PDF file *target* with the chosen *backend*.

    *backend* is ``'pdfkit'`` or anything else for weasyprint.  Errors from
    the backend or from writing the file propagate to the caller.
    """
    if backend == 'pdfkit':
        pdfkit.from_url(url, target)
    else:
        pdf = weasyprint.HTML(url).write_pdf()
        with open(target, 'wb') as handle:
            handle.write(pdf)


def main(backend, force):
    """Download every outdated link listed in the ``Links/*.txt`` files.

    For each ``Links/<name>.txt`` (``requirements.txt`` is skipped) the PDFs
    are written to the directory ``<name>`` and the edit dates of the
    downloaded pages are remembered in ``Links/<name>.pickle``.

    backend -- ``'pdfkit'`` or ``'weasyprint'``.
    force   -- download every link regardless of the remembered dates.
    """
    for filename in os.listdir("Links"):
        if not filename.endswith(".txt") or filename == "requirements.txt":
            continue

        print("Examining: " + filename)

        stem = filename[:-4]
        try:
            os.makedirs(stem, exist_ok=True)
        except OSError as exc:
            # Without an output directory nothing from this file can be saved.
            print("Cannot create output directory " + stem + ": " + str(exc))
            continue

        cache_path = os.path.join("Links", stem + ".pickle")
        dates = load_dates(cache_path)

        for url in read_links(os.path.join("Links", filename)):
            new_date = find_edit_date(url)
            old_date = dates.get(url, NEVER_DOWNLOADED)

            if not needs_download(new_date, old_date, force):
                continue

            print("Downloading: " + url)
            print("Edit date: " + str(new_date))

            try:
                # weasyprint seems faster, but seems to be broken sometimes
                render_pdf(url, pdf_name(stem, url), backend)
            except Exception as exc:
                print("Error when printing: " + str(exc))
                # Leave the remembered date untouched so that the next run
                # retries this link instead of considering it up to date.
                continue

            # An unknown edit date is never stored, so such pages are
            # downloaded again on every run.
            if new_date != UNKNOWN_DATE:
                dates[url] = new_date

        save_dates(cache_path, dates)


if __name__ == "__main__":
    main(args.backend, args.force)
|
|
|