|
|
@ -23,7 +23,7 @@ def download_wikipedia_data(
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
print("File not found, downloading now...")
|
|
|
|
print("File not found, downloading now...")
|
|
|
|
# Download the data
|
|
|
|
# Download the data
|
|
|
|
wget.download(data_url, out=download_path, bar=True)
|
|
|
|
wget.download(data_url, out=download_path)
|
|
|
|
|
|
|
|
|
|
|
|
# Unzip the data
|
|
|
|
# Unzip the data
|
|
|
|
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
|
|
|
|
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
|
|
|
@ -43,4 +43,4 @@ def read_wikipedia_data(data_path: str = '../../data/', file_name: str = "vector
|
|
|
|
data['content_vector'] = data.content_vector.apply(literal_eval)
|
|
|
|
data['content_vector'] = data.content_vector.apply(literal_eval)
|
|
|
|
# Set vector_id to be a string
|
|
|
|
# Set vector_id to be a string
|
|
|
|
data['vector_id'] = data['vector_id'].apply(str)
|
|
|
|
data['vector_id'] = data['vector_id'].apply(str)
|
|
|
|
return data
|
|
|
|
return data
|
|
|
|