Modify data generation script for transformers
This commit is contained in:
parent
a4dc9c238b
commit
edd4708123
1 changed file with 3 additions and 2 deletions
5
data.py
5
data.py
|
@@ -21,9 +21,10 @@ statuses = cur.fetchall()
|
||||||
# Clean up statuses
|
# Clean up statuses
|
||||||
for i in range(len(statuses)):
|
for i in range(len(statuses)):
|
||||||
# Remove HTML stuff
|
# Remove HTML stuff
|
||||||
statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
|
statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0]))
|
||||||
# Remove URLs and special characters and convert to lowercase
|
# Remove URLs and special characters and convert to lowercase
|
||||||
statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1])
|
# Uncomment for generating data for LSTMs
|
||||||
|
#statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1])
|
||||||
|
|
||||||
|
|
||||||
# Save to output file
|
# Save to output file
|
||||||
|
|
Loading…
Reference in a new issue