Ignore URLs by ignoring strings that contain a '.'

This commit is contained in:
Anthony Wang 2022-02-21 15:49:29 -06:00
parent 176f6f306b
commit 13c502ef53
Signed by: a
GPG key ID: BC96B00AEC5F2D76

View file

@@ -24,7 +24,7 @@ text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
words = [word for message in text for word in message.split()]
# Remove URLs and special characters and convert to lowercase
words = [sub(r'[^a-z0-9]', '', word.lower())
-for word in words if word.find('://') == -1]
+for word in words if word.find('.') == -1]
# Remove empty strings
words = [word for word in words if word != '']