from collections import Counter

import numpy as np


def preprocess(text):
    """
    Lowercase the text, strip punctuation, and drop rare words.
    :param text: Raw input text
    :return: List of words that occur more than 5 times
    """
    # Replace punctuation with spaces so it doesn't end up in the vocabulary
    text = text.lower()
    for punct in ['.', ',', '"', ';', '!', '?', '(', ')', '--', ':']:
        text = text.replace(punct, ' ')
    # text = text.replace('\n', ' ')
    words = text.split()

    # Remove all words with 5 or fewer occurrences
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > 5]

    return trimmed_words


def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target
    :param int_text: Text with the words replaced by their ids
    :param batch_size: The size of batch
    :param seq_length: The length of sequence
    :return: A list where each item is a tuple of (batch of input, batch of target)
    """
    # Reserve one id so every input word has a next-word target, then keep
    # only as many ids as fill complete batches
    n_batches = (len(int_text) - 1) // (batch_size * seq_length)

    # Drop the trailing word ids that don't fill a complete batch;
    # targets are the inputs shifted one word to the right
    xdata = np.array(int_text[: n_batches * batch_size * seq_length])
    ydata = np.array(int_text[1: n_batches * batch_size * seq_length + 1])

    # Reshape to (batch_size, n_batches * seq_length) and split along the
    # time axis into n_batches arrays of shape (batch_size, seq_length)
    x_batches = np.split(xdata.reshape(batch_size, -1), n_batches, axis=1)
    y_batches = np.split(ydata.reshape(batch_size, -1), n_batches, axis=1)

    return list(zip(x_batches, y_batches))


def create_lookup_tables(words):
    """
    Create lookup tables for vocabulary
    :param words: Input list of words
    :return: A tuple of dicts: the first maps each word to its id
        (vocab_to_int), the second maps each id back to its word
        (int_to_vocab). Ids are assigned in order of descending frequency.
    """
    word_counts = Counter(words)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab
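

# --- Usage sketch ---
# A minimal, hypothetical demo of the full pipeline (not part of the
# original module): preprocess a toy corpus, build the lookup tables,
# encode the words as ids, and batch them. The sample text and the
# batch_size/seq_length values below are illustrative assumptions.
if __name__ == '__main__':
    # Repeat the sentence so every word survives the >5-occurrence filter
    sample = 'The cat sat on the mat. ' * 10
    words = preprocess(sample)
    vocab_to_int, int_to_vocab = create_lookup_tables(words)
    int_text = [vocab_to_int[word] for word in words]

    batches = get_batches(int_text, batch_size=2, seq_length=5)
    x, y = batches[0]
    print(x.shape, y.shape)       # both (batch_size, seq_length) -> (2, 5)
    print(int_to_vocab[x[0, 0]])  # decode the first input id back to a word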