ebooks/data.py

from re import sub
from html import unescape
from argparse import ArgumentParser

from psycopg2 import connect


# Command-line interface: the database to read statuses from and the file
# the extracted words are written to.
parser = ArgumentParser()
parser.add_argument('-d', '--database', help='database connection string')
parser.add_argument('-o', '--output', help='Output file', default='data')
args = parser.parse_args()


# Fetch messages from database since it's way faster than using the API
conn = connect(args.database)
try:
    # The cursor context manager closes the cursor on exit; it does not close
    # the connection, hence the surrounding try/finally.
    with conn.cursor() as cur:
        cur.execute('SELECT * FROM statuses')
        statuses = cur.fetchall()
finally:
    conn.close()


# Use regex to remove HTML stuff. Each tag is replaced by a space so words
# separated only by markup are not glued together; entities are decoded after.
# NOTE(review): status[2] is presumably the column holding the status body;
# this relies on the column order returned by SELECT * -- confirm against
# the table schema.
text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]
# Extract all words from statuses
words = [word for message in text for word in message.split()]
# Remove URLs and special characters and convert to lowercase
words = [sub(r'[^a-z0-9]', '', word.lower()) for word in words if '://' not in word]
# Remove empty strings (words that consisted solely of stripped characters)
words = [word for word in words if word]


# Write the training data, one word per line
with open(args.output, 'w') as f:
    f.writelines(word + '\n' for word in words)
Clean up data.py script for generating training data 2022-02-21 18:54:35 +00:00
			`from re import sub`
			`from html import unescape`
			`from argparse import ArgumentParser`

			`from psycopg2 import connect`


			`parser = ArgumentParser()`
			`parser.add_argument('-d', '--database', help='database connection string')`
			`parser.add_argument('-o', '--output', help='Output file', default='data')`
			`args = parser.parse_args()`


			`# Fetch messages from database since it's way faster than using the API`
			`conn = connect(args.database)`
			`cur = conn.cursor()`
			`cur.execute('SELECT * FROM statuses')`
			`statuses = cur.fetchall()`


			`# Use regex to remove HTML stuff`
			`text = [unescape(sub(r'<[^>]*>', ' ', status[2])) for status in statuses]`
			`# Extract all words from statuses`
			`words = [word for message in text for word in message.split()]`
			`# Remove URLs and special characters and convert to lowercase`
			`words = [sub(r'[^a-z0-9]', '', word.lower()) for word in words if word.find('://') == -1]`
			`# Remove empty strings`
			`words = [word for word in words if word != '']`


			`with open(args.output, 'w') as f:`
			`    for word in words:`
			`        f.write(word + '\n')`