Ignore URLs by skipping any word that contains a '.'
This commit is contained in:
parent
176f6f306b
commit
13c502ef53
2
data.py
2
data.py
|
@@ -24,7 +24,7 @@ text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
|
|||
words = [word for message in text for word in message.split()]
|
||||
# Remove URLs and special characters and convert to lowercase
|
||||
words = [sub(r'[^a-z0-9]', '', word.lower())
|
||||
for word in words if word.find('://') == -1]
|
||||
for word in words if word.find('.') == -1]
|
||||
# Remove empty strings
|
||||
words = [word for word in words if word != '']
|
||||
|
||||
|
|
Loading…
Reference in a new issue