|
|
|
@ -22,7 +22,7 @@ with open('data/noc.csv', newline='') as csvfile:
|
|
|
|
|
noc_data.append(record)
|
|
|
|
|
|
|
|
|
|
def include_page(page):
    """Decide whether a NOC page should be included in the corpus.

    Parameters:
        page: mapping with a 'code' key holding the NOC code string.

    Returns:
        bool: True when the code falls under NOC major group 4
        (code starts with '4').
    """
    # NOTE(review): a merge/diff artifact left two return statements here
    # (startswith('42') followed by an unreachable startswith('4')).
    # The later, broader filter is kept as the current behavior — confirm
    # against the intended NOC group selection.
    return page['code'].startswith('4')
|
|
|
|
|
|
|
|
|
|
def to_page_content(page):
    """Render a NOC page record as an attribute-style text line.

    Parameters:
        page: mapping with 'code', 'title', and 'definition' string keys.

    Returns:
        str of the form: code="..." title="..." definition="...".
        Values are interpolated verbatim (no quoting/escaping of embedded
        double quotes).
    """
    # f-string instead of a chained '+' concatenation: same output,
    # easier to read and maintain.
    return f'code="{page["code"]}" title="{page["title"]}" definition="{page["definition"]}"'
|
|
|
|
@ -45,11 +45,11 @@ urls = [
|
|
|
|
|
|
|
|
|
|
# docs = [WebBaseLoader(url).load() for url in urls];

# Each loader returns a list of documents per URL; flatten them into a
# single list. (A diff artifact previously left this flatten/print/split
# sequence duplicated under two names — deduplicated here.)
flattened_docs = [item for sublist in docs for item in sublist]
docs_list = flattened_docs  # backward-compatible alias for the older name

print(flattened_docs)
print('total documents included = ', len(flattened_docs))

# Chunk documents by token count for the embedding step; the small
# overlap preserves context across chunk boundaries.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
doc_splits = text_splitter.split_documents(flattened_docs)
|
|
|
|
|
|
|
|
|
|
# 2. Convert documents to Embeddings and store them
|
|
|
|
|
vectorstore = Chroma.from_documents(
|
|
|
|
|