import os
import pickle
import copy

import numpy as np

# Special token ids shared by the source and target vocabularies.
CODES = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3}


def load_data(path):
    """
    Load Dataset from File
    """
    input_file = os.path.join(path)
    with open(input_file, 'r', encoding='utf-8') as f:
        return f.read()


def preprocess_and_save_data(source_path, target_path, text_to_ids):
    """
    Preprocess Text Data. Save to file.
    """
    # Preprocess
    source_text = load_data(source_path)
    target_text = load_data(target_path)

    source_text = source_text.lower()
    target_text = target_text.lower()

    source_vocab_to_int, source_int_to_vocab = create_lookup_tables(source_text)
    target_vocab_to_int, target_int_to_vocab = create_lookup_tables(target_text)

    source_text, target_text = text_to_ids(
        source_text, target_text, source_vocab_to_int, target_vocab_to_int)

    # Save Data
    with open('/output/preprocess.p', 'wb') as out_file:
        pickle.dump((
            (source_text, target_text),
            (source_vocab_to_int, target_vocab_to_int),
            (source_int_to_vocab, target_int_to_vocab)), out_file)


def load_preprocess():
    """
    Load the Preprocessed Training data
    """
    with open('/output/preprocess.p', mode='rb') as in_file:
        return pickle.load(in_file)


def create_lookup_tables(text):
    """
    Create lookup tables for vocabulary
    """
    vocab = set(text.split())
    # Start from the special tokens, then assign ids to the remaining words.
    vocab_to_int = copy.copy(CODES)

    for v_i, v in enumerate(vocab, len(CODES)):
        vocab_to_int[v] = v_i

    int_to_vocab = {v_i: v for v, v_i in vocab_to_int.items()}

    return vocab_to_int, int_to_vocab


def save_params(params):
    """
    Save parameters to file
    """
    with open('/output/params.p', 'wb') as out_file:
        pickle.dump(params, out_file)


def load_params():
    """
    Load parameters from file
    """
    with open('/output/params.p', mode='rb') as in_file:
        return pickle.load(in_file)


def batch_data(source, target, batch_size):
    """
    Batch source and target together
    """
    for batch_i in range(0, len(source) // batch_size):
        start_i = batch_i * batch_size
        source_batch = source[start_i:start_i + batch_size]
        target_batch = target[start_i:start_i + batch_size]
        yield (np.array(pad_sentence_batch(source_batch)),
               np.array(pad_sentence_batch(target_batch)))


def pad_sentence_batch(sentence_batch):
    """
    Pad sentences with <PAD> id so every sentence in the batch has the same length
    """
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [CODES['<PAD>']] * (max_sentence - len(sentence))
            for sentence in sentence_batch]
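

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the pipeline).
# The toy corpus, the hand-rolled id conversion, and the batch size below are
# invented for this example; only CODES, create_lookup_tables, batch_data,
# and pad_sentence_batch come from this module.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Build vocabularies from a toy corpus.
    text = 'the cat sat\nthe dog ran'
    vocab_to_int, int_to_vocab = create_lookup_tables(text)

    # Convert each sentence to a list of word ids. This is a stand-in for the
    # text_to_ids callback that preprocess_and_save_data expects; a real
    # implementation would typically also append <EOS> to each target
    # sentence, as done here.
    source = [[vocab_to_int[w] for w in line.split()]
              for line in text.split('\n')]
    target = [[vocab_to_int[w] for w in line.split()] + [CODES['<EOS>']]
              for line in text.split('\n')]

    # Each batch comes back as two numpy arrays, padded with <PAD> to the
    # longest sentence in that batch.
    for source_batch, target_batch in batch_data(source, target, batch_size=2):
        print(source_batch.shape, target_batch.shape)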