import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document


def ingest_documents():
    """
    Ingest a PDF into Chroma from the data/ directory, which contains
    EDGAR 10-K filing data for Nike.
    """
    # Pick the first file found in the data/ directory
    data_path = "data/"
    doc_path = [os.path.join(data_path, file) for file in os.listdir(data_path)][0]

    print("Parsing 10-K filing doc for NIKE:", doc_path)

    # Split the filing into overlapping chunks; add_start_index records each
    # chunk's character offset in the source text in its metadata.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    loader = UnstructuredFileLoader(doc_path, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)

    print("Done preprocessing. Created", len(chunks), "chunks of the original PDF")

    # Embedding model used to vectorize each chunk
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Re-wrap the chunks as Document objects, preserving their metadata
    documents = []
    for chunk in chunks:
        doc = Document(page_content=chunk.page_content, metadata=chunk.metadata)
        documents.append(doc)

    # Embed the documents and persist them to the Chroma vector DB
    _ = Chroma.from_documents(
        documents=documents,
        collection_name="xeon-rag",
        embedding=embedder,
        persist_directory="/tmp/xeon_rag_db",
    )


if __name__ == "__main__":
    ingest_documents()
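
# The sketch below shows how the persisted collection could be queried for
# retrieval afterwards. It is illustrative, not part of the original
# ingestion script: query_vectorstore and its question string are made-up
# examples, and it assumes the collection created above already exists at
# /tmp/xeon_rag_db and is read back with the same embedding model.
def query_vectorstore(question: str, k: int = 3):
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    db = Chroma(
        collection_name="xeon-rag",
        embedding_function=embedder,
        persist_directory="/tmp/xeon_rag_db",
    )
    # similarity_search embeds the question and returns the k nearest chunks
    return db.similarity_search(question, k=k)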