Clean up statuses better in data.py

This commit is contained in:
Anthony Wang 2022-02-21 19:43:02 -06:00
parent 1c1a518fc7
commit 289b8b4bcb
Signed by: a
GPG key ID: BC96B00AEC5F2D76

View file

@@ -18,13 +18,12 @@ cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
# Fetch every row produced by the query executed just before this point.
# Each row is indexed with [0] below to reach its text column.
statuses = cur.fetchall()
# Clean up statuses
# Each entry of `statuses` is overwritten in place, going from a result row
# to a single cleaned string.
# NOTE(review): this listing looks like a diff hunk whose +/- markers and
# indentation were lost, so removed and added lines may be interleaved below.
# Confirm the real statement order against data.py before relying on it.
for i in range(len(statuses)):
# Remove HTML stuff
# Deletes every `<...>` span from the row's first column, passes the result
# through unescape (presumably html.unescape; the import is not visible
# here), then splits it on whitespace into a list of words.
statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
# Remove URLs and special characters and convert to lowercase
# Words containing '://' are dropped; the remaining words are lowercased and
# stripped of every character outside a-z and 0-9.
statuses[i] = [sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]
# Remove empty strings
# Words reduced to '' by the previous step are discarded, then the rest are
# joined with single spaces.
statuses[i] = ' '.join([word for word in statuses[i] if word != ''])
# NOTE(review): presumably the single-statement replacement for the two
# assignments above rather than an extra step (applied after them it would
# iterate over the characters of an already-joined string) -- confirm.
# Unlike the version above it does not filter out '' entries, so a word made
# only of special characters would leave an extra space in the joined text.
statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
# Save to output file