Make each line a status instead of just a big text file of words
This commit is contained in:
parent
10c21fb1cd
commit
1c1a518fc7
20
data.py
20
data.py
|
@@ -18,18 +18,16 @@ cur.execute('SELECT text FROM statuses WHERE language=\'en\'')
|
|||
statuses = cur.fetchall()
|
||||
|
||||
|
||||
# Use regex to remove HTML stuff
|
||||
text = [unescape(sub(r'<[^>]*>', '', status[0])) for status in statuses]
|
||||
# Extract all words from statuses
|
||||
words = [word for message in text for word in message.split()]
|
||||
# Remove URLs and special characters and convert to lowercase
|
||||
words = [sub(r'[^a-z0-9]', '', word.lower())
|
||||
for word in words if word.find('://') == -1]
|
||||
# Remove empty strings
|
||||
words = [word for word in words if word != '']
|
||||
for i in range(len(statuses)):
|
||||
# Remove HTML stuff
|
||||
statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split()
|
||||
# Remove URLs and special characters and convert to lowercase
|
||||
statuses[i] = [sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]
|
||||
# Remove empty strings
|
||||
statuses[i] = ' '.join([word for word in statuses[i] if word != ''])
|
||||
|
||||
|
||||
# Save to output file
|
||||
with open(args.output, 'w') as f:
|
||||
for word in words:
|
||||
print(word, file=f)
|
||||
for status in statuses:
|
||||
print(status, file=f)
|
||||
|
|
Loading…
Reference in a new issue