You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
2.1 KiB
Python
59 lines
2.1 KiB
Python
import re
|
|
from collections import Counter
|
|
|
|
def preprocess(text):
|
|
|
|
# Replace punctuation with tokens so we can use them in our model
|
|
text = text.lower()
|
|
text = text.replace('.', ' <PERIOD> ')
|
|
text = text.replace(',', ' <COMMA> ')
|
|
text = text.replace('"', ' <QUOTATION_MARK> ')
|
|
text = text.replace(';', ' <SEMICOLON> ')
|
|
text = text.replace('!', ' <EXCLAMATION_MARK> ')
|
|
text = text.replace('?', ' <QUESTION_MARK> ')
|
|
text = text.replace('(', ' <LEFT_PAREN> ')
|
|
text = text.replace(')', ' <RIGHT_PAREN> ')
|
|
text = text.replace('--', ' <HYPHENS> ')
|
|
text = text.replace('?', ' <QUESTION_MARK> ')
|
|
# text = text.replace('\n', ' <NEW_LINE> ')
|
|
text = text.replace(':', ' <COLON> ')
|
|
words = text.split()
|
|
|
|
# Remove all words with 5 or fewer occurences
|
|
word_counts = Counter(words)
|
|
trimmed_words = [word for word in words if word_counts[word] > 5]
|
|
|
|
return trimmed_words
|
|
|
|
def get_batches(int_text, batch_size, seq_length):
|
|
"""
|
|
Return batches of input and target
|
|
:param int_text: Text with the words replaced by their ids
|
|
:param batch_size: The size of batch
|
|
:param seq_length: The length of sequence
|
|
:return: A list where each item is a tuple of (batch of input, batch of target).
|
|
"""
|
|
n_batches = int(len(int_text) / (batch_size * seq_length))
|
|
|
|
# Drop the last few characters to make only full batches
|
|
xdata = np.array(int_text[: n_batches * batch_size * seq_length])
|
|
ydata = np.array(int_text[1: n_batches * batch_size * seq_length + 1])
|
|
|
|
x_batches = np.split(xdata.reshape(batch_size, -1), n_batches, 1)
|
|
y_batches = np.split(ydata.reshape(batch_size, -1), n_batches, 1)
|
|
|
|
return list(zip(x_batches, y_batches))
|
|
|
|
|
|
def create_lookup_tables(words):
|
|
"""
|
|
Create lookup tables for vocabulary
|
|
:param words: Input list of words
|
|
:return: A tuple of dicts. The first dict....
|
|
"""
|
|
word_counts = Counter(words)
|
|
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
|
|
int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
|
|
vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
|
|
|
|
return vocab_to_int, int_to_vocab |