mirror of https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
50 lines · 1.5 KiB · Python
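# NOTE: assumes the `langchain`, `langchain-community`, `unstructured` (with PDF
# support), `sentence-transformers`, and `chromadb` packages are installed.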
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document


def ingest_documents():
    """
    Ingest the PDF of Nike's Edgar 10-K filing from the data/
    directory into a Chroma vector store.
    """
    # Take the first file in the data/ directory (the Nike 10-K filing PDF)
    data_path = "data/"
    doc = [os.path.join(data_path, file) for file in os.listdir(data_path)][0]

    print("Parsing 10k filing doc for NIKE", doc)

    # Split the filing into overlapping chunks; add_start_index stores each
    # chunk's offset into the source document in its metadata
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)

    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    # Create the embedding model used to vectorize each chunk
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Re-wrap the chunks as Document objects, preserving their metadata
    documents = []
    for chunk in chunks:
        doc = Document(page_content=chunk.page_content, metadata=chunk.metadata)
        documents.append(doc)
    # Add the documents to a Chroma vector store persisted on disk
    _ = Chroma.from_documents(
        documents=documents,
        collection_name="xeon-rag",
        embedding=embedder,
        persist_directory="/tmp/xeon_rag_db",
    )


if __name__ == "__main__":
    ingest_documents()
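
# A minimal sketch of how the persisted store could be queried afterwards
# (assumes the same embedding model and the collection name / persist path
# used above; the query string is only illustrative):
#
#     from langchain_community.embeddings import HuggingFaceEmbeddings
#     from langchain_community.vectorstores import Chroma
#
#     embedder = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-MiniLM-L6-v2"
#     )
#     db = Chroma(
#         collection_name="xeon-rag",
#         embedding_function=embedder,
#         persist_directory="/tmp/xeon_rag_db",
#     )
#     for hit in db.similarity_search("What was Nike's revenue?", k=4):
#         print(hit.metadata.get("start_index"), hit.page_content[:120])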