2022-02-21 18:54:35 +00:00
|
|
|
from argparse import ArgumentParser
|
2022-02-21 21:03:28 +00:00
|
|
|
from html import unescape
|
|
|
|
from re import sub
|
2022-02-21 18:54:35 +00:00
|
|
|
|
|
|
|
from psycopg2 import connect
|
|
|
|
|
|
|
|
|
|
|
|
# Punctuation that commonly trails a word at the end of a sentence or clause.
# It is stripped before the URL check so that "tomorrow." is not mistaken for
# a URL just because it ends with a period.
TRAILING_PUNCTUATION = '.,;:!?)]}"\''


def build_parser():
    """Return the command-line argument parser for this script."""
    parser = ArgumentParser()
    parser.add_argument('-d', '--database', help='database connection string')
    parser.add_argument('-o', '--output', help='output file', default='data')
    return parser


def fetch_statuses(dsn):
    """Return all rows of the ``statuses`` table as a list of tuples.

    Args:
        dsn: Connection string handed to ``psycopg2.connect`` unchanged.

    The cursor and the connection are closed before returning, including
    when the query raises.
    """
    # Fetch messages from database since it's way faster than using the API
    conn = connect(dsn)
    try:
        with conn.cursor() as cur:
            cur.execute('SELECT * FROM statuses')
            return cur.fetchall()
    finally:
        conn.close()


def looks_like_url(word):
    """Return True if *word* still contains a period once trailing
    punctuation has been removed (e.g. ``example.com``, ``https://a.b/c``).
    """
    return '.' in word.rstrip(TRAILING_PUNCTUATION)


def extract_words(messages):
    """Return the cleaned words contained in *messages*, in order.

    Args:
        messages: Iterable of HTML strings; ``None`` entries are skipped.

    Returns:
        A list of non-empty lowercase strings consisting only of the
        characters ``a-z`` and ``0-9``.
    """
    words = []
    for message in messages:
        if message is None:
            continue
        # Use regex to remove HTML stuff. Tags are replaced by a space so
        # adjacent words do not merge; entities are decoded afterwards so an
        # escaped "&lt;" is never treated as the start of a tag.
        plain = unescape(sub(r'<[^>]*>', ' ', message))
        for word in plain.split():
            # Remove URLs
            if looks_like_url(word):
                continue
            # Remove special characters and convert to lowercase
            cleaned = sub(r'[^a-z0-9]', '', word.lower())
            # Remove empty strings
            if cleaned:
                words.append(cleaned)
    return words


def main():
    """Read statuses from the database and write one word per line."""
    args = build_parser().parse_args()
    statuses = fetch_statuses(args.database)

    # NOTE(review): index 2 is presumably the message text column; it is
    # positional because the query is `SELECT *` -- verify against the schema.
    words = extract_words(status[2] for status in statuses)

    # Save to output file
    with open(args.output, 'w') as f:
        f.writelines(word + '\n' for word in words)


if __name__ == '__main__':
    main()
|