mirror of https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
50 lines · 1.5 KiB · Python
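# NOTE: assumes the `langchain`, `langchain-community`, `unstructured` (with PDF
# support), `sentence-transformers`, and `chromadb` packages are installed.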
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document


def ingest_documents():
    """
    Ingest the PDF of Nike's Edgar 10-K filing from the data/
    directory into a Chroma vector store.
    """
    # Take the first file in the data/ directory (the Nike 10-K filing PDF)
    data_path = "data/"
    doc = [os.path.join(data_path, file) for file in os.listdir(data_path)][0]

    print("Parsing 10k filing doc for NIKE", doc)

    # Split the filing into overlapping chunks; add_start_index stores each
    # chunk's offset into the source document in its metadata
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)

    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    # Create the embedding model used to vectorize each chunk
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # Re-wrap the chunks as Document objects, preserving their metadata
    documents = []
    for chunk in chunks:
        doc = Document(page_content=chunk.page_content, metadata=chunk.metadata)
        documents.append(doc)
    # Add the documents to a Chroma vector store persisted on disk
    _ = Chroma.from_documents(
        documents=documents,
        collection_name="xeon-rag",
        embedding=embedder,
        persist_directory="/tmp/xeon_rag_db",
    )


if __name__ == "__main__":
    ingest_documents()
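
# A minimal sketch of how the persisted store could be queried afterwards
# (assumes the same embedding model and the collection name / persist path
# used above; the query string is only illustrative):
#
#     from langchain_community.embeddings import HuggingFaceEmbeddings
#     from langchain_community.vectorstores import Chroma
#
#     embedder = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-MiniLM-L6-v2"
#     )
#     db = Chroma(
#         collection_name="xeon-rag",
#         embedding_function=embedder,
#         persist_directory="/tmp/xeon_rag_db",
#     )
#     for hit in db.similarity_search("What was Nike's revenue?", k=4):
#         print(hit.metadata.get("start_index"), hit.page_content[:120])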