ebooks/data.py

35 lines
1 KiB
Python
Raw Normal View History

from re import sub
from html import unescape
from argparse import ArgumentParser
from psycopg2 import connect
def strip_html(markup):
    """Return *markup* with HTML tags blanked out and entities decoded.

    Every ``<...>`` tag is replaced by a single space so that words on
    either side of a tag stay separated. Entities are decoded only after
    the tags are gone, so an escaped ``&lt;b&gt;`` survives as literal text
    instead of being treated as a tag.
    """
    return unescape(sub(r'<[^>]*>', ' ', markup))


def extract_words(messages):
    """Return the normalized words found in *messages*, in order.

    Each message is split on whitespace. Tokens containing ``://`` (URLs)
    are dropped; the remaining tokens are lowercased and stripped of every
    character outside ``a-z0-9``. Tokens that end up empty are discarded.
    """
    words = []
    for message in messages:
        for token in message.split():
            if '://' in token:
                continue
            word = sub(r'[^a-z0-9]', '', token.lower())
            if word:
                words.append(word)
    return words


def fetch_statuses(dsn):
    """Return every row of the ``statuses`` table as a list of tuples.

    The cursor and the connection are always closed, even if the query
    fails, so the script never leaves a database session dangling.
    """
    conn = connect(dsn)
    try:
        with conn.cursor() as cur:
            cur.execute('SELECT * FROM statuses')
            return cur.fetchall()
    finally:
        conn.close()


def main(argv=None):
    """Dump one normalized word per line from the statuses table.

    *argv* is the argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = ArgumentParser()
    parser.add_argument('-d', '--database', help='database connection string')
    parser.add_argument('-o', '--output', help='output file', default='data')
    args = parser.parse_args(argv)

    # Fetch messages from database since it's way faster than using the API
    statuses = fetch_statuses(args.database)

    # NOTE(review): column index 2 is presumably the status body; this
    # depends on the table's column order -- confirm against the schema.
    # Falsy values (NULL / empty string) contribute no words, so skip them
    # rather than letting a NULL crash the regex.
    text = [strip_html(status[2]) for status in statuses if status[2]]

    words = extract_words(text)

    with open(args.output, 'w') as f:
        f.writelines(word + '\n' for word in words)


if __name__ == '__main__':
    main()