# Load
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnablePassthrough
from unstructured.partition.pdf import partition_pdf

# Path to docs
path = "docs"
raw_pdf_elements = partition_pdf(
    filename=path + "/LLaVA.pdf",
    # Unstructured can also extract embedded image blocks; not needed here
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post-processing to aggregate text once we have the titles
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks:
    # attempt to start a new chunk after 3800 chars, hard cap at 4000 chars,
    # and combine any text chunk under 2000 chars with the next one
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
)

# Categorize by type
tables = []
texts = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        tables.append(str(element))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        texts.append(str(element))

# Summarize
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
model = ChatOpenAI(temperature=0, model="gpt-4")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

# Apply to tables
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

# To save time / cost, only summarize the texts when the chunks are large
# text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
# Otherwise, we can just use the raw texts as their own summaries
text_summaries = texts

# Use multi-vector retriever

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="summaries", embedding_function=OpenAIEmbeddings()
)

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i]})
    for i, s in enumerate(table_summaries)
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))

# RAG

# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""  # noqa: E501
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4")

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)
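
# Note: the vectorstore indexes the summaries, but the retriever returns the raw
# parent chunks stored in the docstore, so the LLM sees full tables/text rather
# than their summaries. Optional sanity check (a rough sketch; the query string
# below is only illustrative):
# retrieved = retriever.get_relevant_documents("What is LLaVA?")
# print(retrieved[0][:500])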

# Add typing for input
class Question(BaseModel):
    __root__: str


chain = chain.with_types(input_type=Question)
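
# Example usage (a minimal sketch): running this module directly indexes
# docs/LLaVA.pdf as above and answers one illustrative question; it assumes
# OPENAI_API_KEY is set in the environment.
if __name__ == "__main__":
    example_question = "What are the key contributions of LLaVA?"
    print(chain.invoke(example_question))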