langchain/templates/rag-semi-structured/rag_semi_structured/chain.py

# Load
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnablePassthrough
from unstructured.partition.pdf import partition_pdf

# Path to docs
path = "docs"
raw_pdf_elements = partition_pdf(
    filename=path + "/LLaVA.pdf",
    # Unstructured first finds embedded image blocks
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=path,
)

# Categorize by type
tables = []
texts = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        tables.append(str(element))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        texts.append(str(element))

# Summarize

prompt_text = """You are an assistant tasked with summarizing tables and text. \ 
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
model = ChatOpenAI(temperature=0, model="gpt-4")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

# Apply
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
# To save time / cost, only do text summaries if chunk sizes are large
# text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
# We can just assign text_summaries to the raw texts
text_summaries = texts

# Use multi vector retriever

# The vectorstore to use to index the child chunks
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())

# The storage layer for the parent documents
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=s, metadata={id_key: doc_ids[i]})
    for i, s in enumerate(text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=s, metadata={id_key: table_ids[i]})
    for i, s in enumerate(table_summaries)
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))

# RAG

# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""  # noqa: E501
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4")

# RAG pipeline
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)


# Add typing for input
class Question(BaseModel):
    __root__: str


chain = chain.with_types(input_type=Question)
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`# Load`
			`import uuid`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00
			`from langchain.retrievers.multi_vector import MultiVectorRetriever`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`from langchain.storage import InMemoryStore`
community[patch]: deprecate langchain_community Chroma in favor of langchain_chroma (#24474) 2024-07-22 15:00:13 +00:00			`from langchain_chroma import Chroma`
docs, community[patch], experimental[patch], langchain[patch], cli[pa… (#15412) …tch]: import models from community ran ```bash git grep -l 'from langchain\.chat_models' \| xargs -L 1 sed -i '' "s/from\ langchain\.chat_models/from\ langchain_community.chat_models/g" git grep -l 'from langchain\.llms' \| xargs -L 1 sed -i '' "s/from\ langchain\.llms/from\ langchain_community.llms/g" git grep -l 'from langchain\.embeddings' \| xargs -L 1 sed -i '' "s/from\ langchain\.embeddings/from\ langchain_community.embeddings/g" git checkout master libs/langchain/tests/unit_tests/llms git checkout master libs/langchain/tests/unit_tests/chat_models git checkout master libs/langchain/tests/unit_tests/embeddings/test_imports.py make format cd libs/langchain; make format cd ../experimental; make format cd ../core; make format ``` 2024-01-02 20:32:16 +00:00			`from langchain_community.chat_models import ChatOpenAI`
			`from langchain_community.embeddings import OpenAIEmbeddings`
docs[patch], templates[patch]: Import from core (#14575) Update imports to use core for the low-hanging fruit changes. Ran following ```bash git grep -l 'langchain.schema.runnable' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.runnable/langchain_core.runnables/g' git grep -l 'langchain.schema.output_parser' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.output_parser/langchain_core.output_parsers/g' git grep -l 'langchain.schema.messages' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.messages/langchain_core.messages/g' git grep -l 'langchain.schema.chat_histry' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.chat_history/langchain_core.chat_history/g' git grep -l 'langchain.schema.prompt_template' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.prompt_template/langchain_core.prompts/g' git grep -l 'from langchain.pydantic_v1' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.pydantic_v1/from langchain_core.pydantic_v1/g' git grep -l 'from langchain.tools.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.tools\.base/from langchain_core.tools/g' git grep -l 'from langchain.chat_models.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.chat_models.base/from langchain_core.language_models.chat_models/g' git grep -l 'from langchain.llms.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.llms\.base\ /from langchain_core.language_models.llms\ /g' git grep -l 'from langchain.embeddings.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.embeddings\.base/from langchain_core.embeddings/g' git grep -l 'from langchain.vectorstores.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.vectorstores\.base/from langchain_core.vectorstores/g' git grep -l 'from langchain.agents.tools' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.agents\.tools/from langchain_core.tools/g' git grep -l 'from langchain.schema.output' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.output\ /from langchain_core.outputs\ /g' git grep -l 'from langchain.schema.embeddings' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.embeddings/from langchain_core.embeddings/g' git grep -l 'from langchain.schema.document' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.document/from langchain_core.documents/g' git grep -l 'from langchain.schema.agent' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.agent/from langchain_core.agents/g' git grep -l 'from langchain.schema.prompt ' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.prompt\ /from langchain_core.prompt_values /g' git grep -l 'from langchain.schema.language_model' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.language_model/from langchain_core.language_models/g' ``` 2023-12-12 00:49:10 +00:00			`from langchain_core.documents import Document`
			`from langchain_core.output_parsers import StrOutputParser`
templates: fix deps (#15439) 2024-01-03 21:28:05 +00:00			`from langchain_core.prompts import ChatPromptTemplate`
docs[patch], templates[patch]: Import from core (#14575) Update imports to use core for the low-hanging fruit changes. Ran following ```bash git grep -l 'langchain.schema.runnable' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.runnable/langchain_core.runnables/g' git grep -l 'langchain.schema.output_parser' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.output_parser/langchain_core.output_parsers/g' git grep -l 'langchain.schema.messages' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.messages/langchain_core.messages/g' git grep -l 'langchain.schema.chat_histry' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.chat_history/langchain_core.chat_history/g' git grep -l 'langchain.schema.prompt_template' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.prompt_template/langchain_core.prompts/g' git grep -l 'from langchain.pydantic_v1' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.pydantic_v1/from langchain_core.pydantic_v1/g' git grep -l 'from langchain.tools.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.tools\.base/from langchain_core.tools/g' git grep -l 'from langchain.chat_models.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.chat_models.base/from langchain_core.language_models.chat_models/g' git grep -l 'from langchain.llms.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.llms\.base\ /from langchain_core.language_models.llms\ /g' git grep -l 'from langchain.embeddings.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.embeddings\.base/from langchain_core.embeddings/g' git grep -l 'from langchain.vectorstores.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.vectorstores\.base/from langchain_core.vectorstores/g' git grep -l 'from langchain.agents.tools' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.agents\.tools/from langchain_core.tools/g' git grep -l 'from langchain.schema.output' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.output\ /from langchain_core.outputs\ /g' git grep -l 'from langchain.schema.embeddings' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.embeddings/from langchain_core.embeddings/g' git grep -l 'from langchain.schema.document' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.document/from langchain_core.documents/g' git grep -l 'from langchain.schema.agent' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.agent/from langchain_core.agents/g' git grep -l 'from langchain.schema.prompt ' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.prompt\ /from langchain_core.prompt_values /g' git grep -l 'from langchain.schema.language_model' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.language_model/from langchain_core.language_models/g' ``` 2023-12-12 00:49:10 +00:00			`from langchain_core.pydantic_v1 import BaseModel`
			`from langchain_core.runnables import RunnablePassthrough`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`from unstructured.partition.pdf import partition_pdf`

			`# Path to docs`
			`path = "docs"`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`raw_pdf_elements = partition_pdf(`
fix: to rag-semi-structured template (#14568) Description: Fixes to rag-semi-structured template. - Added required libraries - pdfminer was causing issues when installing with pip. pdfminer.six works best - Changed the pdf name for demo from llama2 to llava <!-- Thank you for contributing to LangChain! Replace this entire comment with: - Description: a description of the change, - Issue: the issue # it fixes (if applicable), - Dependencies: any dependencies required for this change, - Tag maintainer: for a quicker response, tag the relevant maintainer (see below), - Twitter handle: we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --> 2023-12-12 23:44:35 +00:00			`filename=path + "/LLaVA.pdf",`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`# Unstructured first finds embedded image blocks`
			`extract_images_in_pdf=False,`
			`# Use layout model (YOLOX) to get bounding boxes (for tables) and find titles`
			`# Titles are any sub-section of the document`
			`infer_table_structure=True,`
			`# Post processing to aggregate text once we have the title`
			`chunking_strategy="by_title",`
			`# Chunking params to aggregate text blocks`
			`# Attempt to create a new chunk 3800 chars`
			`# Attempt to keep chunks > 2000 chars`
			`max_characters=4000,`
			`new_after_n_chars=3800,`
			`combine_text_under_n_chars=2000,`
			`image_output_dir_path=path,`
			`)`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00
			`# Categorize by type`
			`tables = []`
			`texts = []`
			`for element in raw_pdf_elements:`
			`if "unstructured.documents.elements.Table" in str(type(element)):`
			`tables.append(str(element))`
			`elif "unstructured.documents.elements.CompositeElement" in str(type(element)):`
			`texts.append(str(element))`

			`# Summarize`

Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`prompt_text = """You are an assistant tasked with summarizing tables and text. \`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`Give a concise summary of the table or text. Table or text chunk: {element} """`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`prompt = ChatPromptTemplate.from_template(prompt_text)`
			`model = ChatOpenAI(temperature=0, model="gpt-4")`
			`summarize_chain = {"element": lambda x: x} \| prompt \| model \| StrOutputParser()`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00
			`# Apply`
			`table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})`
			`# To save time / cost, only do text summaries if chunk sizes are large`
			`# text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`# We can just assign text_summaries to the raw texts`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`text_summaries = texts`

			`# Use multi vector retriever`

			`# The vectorstore to use to index the child chunks`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00
			`# The storage layer for the parent documents`
			`store = InMemoryStore()`
			`id_key = "doc_id"`

			`# The retriever (empty to start)`
			`retriever = MultiVectorRetriever(`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`vectorstore=vectorstore,`
			`docstore=store,`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`id_key=id_key,`
			`)`

			`# Add texts`
			`doc_ids = [str(uuid.uuid4()) for _ in texts]`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`summary_texts = [`
			`Document(page_content=s, metadata={id_key: doc_ids[i]})`
			`for i, s in enumerate(text_summaries)`
			`]`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`retriever.vectorstore.add_documents(summary_texts)`
			`retriever.docstore.mset(list(zip(doc_ids, texts)))`

			`# Add tables`
			`table_ids = [str(uuid.uuid4()) for _ in tables]`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`summary_tables = [`
			`Document(page_content=s, metadata={id_key: table_ids[i]})`
			`for i, s in enumerate(table_summaries)`
			`]`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`retriever.vectorstore.add_documents(summary_tables)`
			`retriever.docstore.mset(list(zip(table_ids, tables)))`

			`# RAG`

			`# Prompt template`
			`template = """Answer the question based only on the following context, which can include text and tables:`
			`{context}`
			`Question: {question}`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`""" # noqa: E501`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`prompt = ChatPromptTemplate.from_template(template)`

			`# LLM`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`model = ChatOpenAI(temperature=0, model="gpt-4")`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00
			`# RAG pipeline`
			`chain = (`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`{"context": retriever, "question": RunnablePassthrough()}`
			`\| prompt`
			`\| model`
Templates (#12294) Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Lance Martin <lance@langchain.dev> Co-authored-by: Jacob Lee <jacoblee93@gmail.com> 2023-10-26 01:47:42 +00:00			`\| StrOutputParser()`
Format Templates (#12396) 2023-10-27 02:44:30 +00:00			`)`
Add RAG input types (#12684) Co-authored-by: Erick Friis <erick@langchain.dev> 2023-11-01 00:13:44 +00:00

			`# Add typing for input`
			`class Question(BaseModel):`
			`__root__: str`


			`chain = chain.with_types(input_type=Question)`