diff --git a/data.py b/data.py index acfc64e..2ff8a59 100644 --- a/data.py +++ b/data.py @@ -21,9 +21,10 @@ statuses = cur.fetchall() # Clean up statuses for i in range(len(statuses)): # Remove HTML stuff - statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])).split() + statuses[i] = unescape(sub(r'<[^>]*>', '', statuses[i][0])) # Remove URLs and special characters and convert to lowercase - statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i] if word.find('://') == -1]) + # Uncomment for generating data for LSTMs + #statuses[i] = ' '.join([sub(r'[^a-z0-9]', '', word.lower()) for word in statuses[i].split() if word.find('://') == -1]) # Save to output file