mirror of
https://github.com/openai/openai-cookbook
synced 2024-11-08 01:10:29 +00:00
46 lines
1.7 KiB
Python
46 lines
1.7 KiB
Python
import os
|
|
import wget
|
|
import zipfile
|
|
import numpy as np
|
|
import pandas as pd
|
|
from ast import literal_eval
|
|
|
|
|
|
def download_wikipedia_data(
        data_path: str = '../../data/',
        download_path: str = "./",
        file_name: str = "vector_database_wikipedia_articles_embedded") -> None:
    """Make sure the embedded Wikipedia articles CSV is present on disk.

    Nothing is done when ``<data_path>/<file_name>.csv`` already exists.
    Otherwise the archive ``<download_path>/<file_name>.zip`` is used if it
    is already there, or downloaded to that location first; it is then
    extracted into ``data_path`` and the archive is deleted.

    Args:
        data_path: Directory the archive is extracted into and where the
            CSV is looked for.
        download_path: Directory where the zip archive is looked for and
            downloaded to.
        file_name: Base name (without extension) of both the CSV and the
            zip archive.

    Returns:
        None. The function only prepares the file on disk; it does not load
        the data.

    Raises:
        zipfile.BadZipFile: If the archive cannot be read as a zip file.
        OSError: If the archive cannot be opened or removed.
    """
    data_url = 'https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip'

    csv_file_path = os.path.join(data_path, file_name + ".csv")
    zip_file_path = os.path.join(download_path, file_name + ".zip")

    # Guard clause: the extracted CSV is already available.
    if os.path.isfile(csv_file_path):
        print("File Downloaded")
        return

    if os.path.isfile(zip_file_path):
        print("Zip downloaded but not unzipped, unzipping now...")
    else:
        print("File not found, downloading now...")
        # Download straight to zip_file_path so the archive ends up exactly
        # where the extraction step below looks for it, whatever file_name is.
        # NOTE(review): wget documents `bar` as a progress callable; confirm
        # that passing True behaves as intended.
        wget.download(data_url, out=zip_file_path, bar=True)

    # Unzip the data
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(data_path)

    # Remove the archive that was actually used. A hard-coded file name
    # relative to the current directory fails whenever download_path or
    # file_name differ from their defaults.
    os.remove(zip_file_path)
    print(f"File downloaded to {data_path}")
|
|
|
|
|
|
def read_wikipedia_data(data_path: str = '../../data/', file_name: str = "vector_database_wikipedia_articles_embedded") -> pd.DataFrame:
    """Load ``<data_path>/<file_name>.csv`` into a DataFrame.

    The ``title_vector`` and ``content_vector`` columns are stored in the CSV
    as strings; each cell is parsed back into a Python object with
    ``ast.literal_eval``. The ``vector_id`` column is converted to ``str``.

    Args:
        data_path: Directory containing the CSV file.
        file_name: Base name of the CSV file, without the ``.csv`` extension.

    Returns:
        The loaded DataFrame with the three columns above converted.
    """
    frame = pd.read_csv(os.path.join(data_path, file_name + ".csv"))

    # Turn the serialized vectors back into Python objects, column by column.
    for vector_column in ('title_vector', 'content_vector'):
        frame[vector_column] = frame[vector_column].apply(literal_eval)

    # Identifiers are kept as strings.
    frame['vector_id'] = frame['vector_id'].apply(str)
    return frame