Modify data generation script for transformers
This commit is contained in:
parent
a4dc9c238b
commit
edd4708123
5
data.py
5
data.py
|
@@ -21,9 +21,10 @@ statuses = cur.fetchall()
|
|||
# Clean up statuses
|
||||
for i in range(len(statuses)):
|
||||
# Remove HTML stuff
|
||||
statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
|
||||
statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
|
||||
# Remove URLs and special characters and convert to lowercase
|
||||
statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
|
||||
# Uncomment for generating data for LSTMs
|
||||
#statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
|
||||
|
||||
|
||||
# Save to output file
|
||||
|
|
Loading…
Reference in a new issue