Modify data generation script for transformers

This commit is contained in:
Anthony Wang 2022-02-22 16:57:12 -06:00
parent a4dc9c238b
commit edd4708123
Signed by: a
GPG key ID: BC96B00AEC5F2D76

View file

@@ -21,9 +21,10 @@ statuses = cur.fetchall()
# Clean up statuses
for i in range(len(statuses)):
# Remove HTML stuff
-statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
+statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
# Remove URLs and special characters and convert to lowercase
-statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
+# Uncomment for generating data for LSTMs
+#statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
# Save to output file