2020-12-08 03:57:46 +00:00
|
|
|
#!venv/bin/python3
|
2020-11-28 00:29:13 +00:00
|
|
|
import pdfkit
|
2020-11-28 01:09:07 +00:00
|
|
|
import weasyprint
|
2020-11-28 00:08:09 +00:00
|
|
|
import re
|
2020-11-28 00:29:13 +00:00
|
|
|
import os
|
2020-11-28 01:09:07 +00:00
|
|
|
import argparse
|
2020-12-14 03:23:26 +00:00
|
|
|
import pickle
|
|
|
|
from htmldate import find_date
|
|
|
|
from datetime import date
|
2020-11-28 01:09:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Command-line interface: a single optional switch selects which library
# renders the pages to PDF ("pdfkit" unless told otherwise).
parser = argparse.ArgumentParser()
parser.add_argument(
    '--backend', '-b',
    dest='backend',
    choices=['pdfkit', 'weasyprint'],
    default='pdfkit',
    help='change the download backend; default: pdfkit',
)
args = parser.parse_args()
|
|
|
|
|
2020-11-28 00:08:09 +00:00
|
|
|
|
2020-11-28 00:29:13 +00:00
|
|
|
# Main loop: for every "<name>.txt" file in the current directory (except
# requirements.txt), treat each non-empty line as a URL and save the page as
# a PDF inside the directory "<name>/".  The last known edit date of every
# URL is remembered in "<name>.pickle", so a page is only downloaded again
# when htmldate reports a newer date than the stored one.

# Stand-in date for pages whose edit date cannot be determined: it is later
# than any real date, so such pages are downloaded on every run, and it is
# never written to the cache.
_UNKNOWN_DATE = date.fromisoformat("9999-01-01")
# Stand-in date for URLs that are not in the cache yet: earlier than any
# real date, so they are always downloaded the first time.
_NEVER_SEEN = date.fromisoformat("1970-01-01")
# Everything except word characters, "-" and "." is dropped from file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'(?u)[^-\w.]')

for filename in os.listdir("."):
    if not filename.endswith(".txt") or filename == "requirements.txt":
        continue

    print("Examining: " + filename)

    # "<name>" without the ".txt" suffix names both the output directory
    # and the date cache.
    collection = filename[:-4]
    cache_path = collection + ".pickle"

    try:
        os.mkdir(collection)
    except FileExistsError:
        pass  # already created by an earlier run
    except OSError as err:
        # Stay best-effort: the downloads below report their own failures.
        print("Could not create directory " + collection + ": " + str(err))

    # NOTE(security): pickle can execute arbitrary code while loading; only
    # run this script next to cache files it has written itself.
    try:
        with open(cache_path, 'rb') as cache_file:
            dates = pickle.load(cache_file)
    except FileNotFoundError:
        dates = {}  # first run for this list
    except Exception as err:
        # Unpickling damaged data can raise many different exception types;
        # start with an empty cache instead of aborting the whole run.
        print("Ignoring unreadable cache " + cache_path + ": " + str(err))
        dates = {}
    if not isinstance(dates, dict):
        dates = {}

    with open(filename, "r") as link_file:
        # strip() removes the line terminator without cutting a real
        # character off a last line that has no trailing newline.
        urls = [line.strip() for line in link_file]

    for url in urls:
        if not url:
            continue  # blank line in the list

        new_date_str = find_date(url)
        if new_date_str is None:
            new_date = _UNKNOWN_DATE
        else:
            new_date = date.fromisoformat(new_date_str)

        old_date = dates.get(url, _NEVER_SEEN)

        if new_date > old_date:
            print("Downloading: " + url)
            print("Edit date: " + str(new_date))

            # url[5:] skips the first five characters (the "http:" /
            # "https" prefix) before the name is sanitised.
            name = os.path.join(
                collection,
                _UNSAFE_FILENAME_CHARS.sub('', url[5:]) + ".pdf",
            )
            try:
                if args.backend == 'pdfkit':
                    pdfkit.from_url(url, name)
                else:
                    pdf = weasyprint.HTML(url).write_pdf()
                    with open(name, 'wb') as pdf_file:
                        pdf_file.write(pdf)
            except Exception:
                print("Error when printing")
                # Do not record the date of a page that was not saved, so
                # the download is attempted again on the next run.
                continue

        if new_date != _UNKNOWN_DATE:
            dates[url] = new_date

    with open(cache_path, 'wb') as cache_file:
        pickle.dump(dates, cache_file)
|
|
|
|
|