pull/1111/head
Rasmus Storjohann 3 months ago
parent 925b905e54
commit bcfd6bdc33

@ -22,7 +22,7 @@ with open('data/noc.csv', newline='') as csvfile:
noc_data.append(record)
def include_page(page, prefix='4'):
    """Return True when the page's code starts with *prefix*.

    Args:
        page: Mapping with a string ``'code'`` entry.
        prefix: Leading characters a code must have to be included.
            Defaults to ``'4'``; pass ``'42'`` for the narrower selection.

    Returns:
        bool: whether ``page['code']`` begins with ``prefix``.
    """
    # A single return: the earlier duplicate return made this check unreachable.
    return page['code'].startswith(prefix)
def to_page_content(page):
    """Render a page record as a single ``key="value"`` text line.

    Args:
        page: Mapping with ``'code'``, ``'title'`` and ``'definition'`` entries.

    Returns:
        str: ``code="..." title="..." definition="..."`` built from the
        three entries, in that order.

    Raises:
        KeyError: if any of the three entries is missing.
    """
    code = page['code']
    title = page['title']
    definition = page['definition']
    return f'code="{code}" title="{title}" definition="{definition}"'
@ -45,11 +45,11 @@ urls = [
# docs = [WebBaseLoader(url).load() for url in urls];
# Flatten the nested `docs` collection (one inner sequence per source) into a
# single list. This is done once; the duplicate `docs_list` pass was removed.
flattened_docs = [item for sublist in docs for item in sublist]
print(flattened_docs)
print('total documents included = ', len(flattened_docs))

# Split the flattened documents into chunks for the embedding step that follows.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
doc_splits = text_splitter.split_documents(flattened_docs)
# 2. Convert documents to Embeddings and store them
vectorstore = Chroma.from_documents(

Loading…
Cancel
Save