2023-10-26 01:47:42 +00:00
|
|
|
# Load
|
|
|
|
import uuid
|
2023-10-27 02:44:30 +00:00
|
|
|
|
|
|
|
from langchain.retrievers.multi_vector import MultiVectorRetriever
|
2023-10-26 01:47:42 +00:00
|
|
|
from langchain.storage import InMemoryStore
|
2024-01-02 20:32:16 +00:00
|
|
|
from langchain_community.chat_models import ChatOpenAI
|
|
|
|
from langchain_community.embeddings import OpenAIEmbeddings
|
2024-01-02 21:47:11 +00:00
|
|
|
from langchain_community.vectorstores import Chroma
|
docs[patch], templates[patch]: Import from core (#14575)
Update imports to use core for the low-hanging fruit changes. Ran
following
```bash
git grep -l 'langchain.schema.runnable' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.runnable/langchain_core.runnables/g'
git grep -l 'langchain.schema.output_parser' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.output_parser/langchain_core.output_parsers/g'
git grep -l 'langchain.schema.messages' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.messages/langchain_core.messages/g'
git grep -l 'langchain.schema.chat_history' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.chat_history/langchain_core.chat_history/g'
git grep -l 'langchain.schema.prompt_template' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.prompt_template/langchain_core.prompts/g'
git grep -l 'from langchain.pydantic_v1' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.pydantic_v1/from langchain_core.pydantic_v1/g'
git grep -l 'from langchain.tools.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.tools\.base/from langchain_core.tools/g'
git grep -l 'from langchain.chat_models.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.chat_models.base/from langchain_core.language_models.chat_models/g'
git grep -l 'from langchain.llms.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.llms\.base\ /from langchain_core.language_models.llms\ /g'
git grep -l 'from langchain.embeddings.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.embeddings\.base/from langchain_core.embeddings/g'
git grep -l 'from langchain.vectorstores.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.vectorstores\.base/from langchain_core.vectorstores/g'
git grep -l 'from langchain.agents.tools' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.agents\.tools/from langchain_core.tools/g'
git grep -l 'from langchain.schema.output' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.output\ /from langchain_core.outputs\ /g'
git grep -l 'from langchain.schema.embeddings' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.embeddings/from langchain_core.embeddings/g'
git grep -l 'from langchain.schema.document' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.document/from langchain_core.documents/g'
git grep -l 'from langchain.schema.agent' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.agent/from langchain_core.agents/g'
git grep -l 'from langchain.schema.prompt ' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.prompt\ /from langchain_core.prompt_values /g'
git grep -l 'from langchain.schema.language_model' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.language_model/from langchain_core.language_models/g'
```
2023-12-12 00:49:10 +00:00
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_core.output_parsers import StrOutputParser
|
2024-01-03 21:28:05 +00:00
|
|
|
from langchain_core.prompts import ChatPromptTemplate
|
docs[patch], templates[patch]: Import from core (#14575)
Update imports to use core for the low-hanging fruit changes. Ran
following
```bash
git grep -l 'langchain.schema.runnable' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.runnable/langchain_core.runnables/g'
git grep -l 'langchain.schema.output_parser' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.output_parser/langchain_core.output_parsers/g'
git grep -l 'langchain.schema.messages' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.messages/langchain_core.messages/g'
git grep -l 'langchain.schema.chat_history' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.chat_history/langchain_core.chat_history/g'
git grep -l 'langchain.schema.prompt_template' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.prompt_template/langchain_core.prompts/g'
git grep -l 'from langchain.pydantic_v1' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.pydantic_v1/from langchain_core.pydantic_v1/g'
git grep -l 'from langchain.tools.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.tools\.base/from langchain_core.tools/g'
git grep -l 'from langchain.chat_models.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.chat_models.base/from langchain_core.language_models.chat_models/g'
git grep -l 'from langchain.llms.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.llms\.base\ /from langchain_core.language_models.llms\ /g'
git grep -l 'from langchain.embeddings.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.embeddings\.base/from langchain_core.embeddings/g'
git grep -l 'from langchain.vectorstores.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.vectorstores\.base/from langchain_core.vectorstores/g'
git grep -l 'from langchain.agents.tools' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.agents\.tools/from langchain_core.tools/g'
git grep -l 'from langchain.schema.output' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.output\ /from langchain_core.outputs\ /g'
git grep -l 'from langchain.schema.embeddings' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.embeddings/from langchain_core.embeddings/g'
git grep -l 'from langchain.schema.document' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.document/from langchain_core.documents/g'
git grep -l 'from langchain.schema.agent' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.agent/from langchain_core.agents/g'
git grep -l 'from langchain.schema.prompt ' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.prompt\ /from langchain_core.prompt_values /g'
git grep -l 'from langchain.schema.language_model' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.language_model/from langchain_core.language_models/g'
```
2023-12-12 00:49:10 +00:00
|
|
|
from langchain_core.pydantic_v1 import BaseModel
|
|
|
|
from langchain_core.runnables import RunnablePassthrough
|
2023-10-26 01:47:42 +00:00
|
|
|
from unstructured.partition.pdf import partition_pdf
|
|
|
|
|
|
|
|
# Path to docs
path = "docs"

# Partition the source PDF into typed elements (tables, text, titles).
# Settings collected in one place so the partition call stays readable.
_partition_kwargs = {
    "filename": path + "/LLaVA.pdf",
    # Unstructured first finds embedded image blocks; skip extracting them.
    "extract_images_in_pdf": False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find
    # titles; titles are any sub-section of the document.
    "infer_table_structure": True,
    # Post processing to aggregate text once we have the title.
    "chunking_strategy": "by_title",
    # Chunking params: hard cap chunks at 4000 chars, prefer starting a new
    # chunk after 3800, and merge chunks smaller than 2000 chars.
    "max_characters": 4000,
    "new_after_n_chars": 3800,
    "combine_text_under_n_chars": 2000,
    "image_output_dir_path": path,
}
raw_pdf_elements = partition_pdf(**_partition_kwargs)
|
2023-10-26 01:47:42 +00:00
|
|
|
|
|
|
|
# Categorize by type: split partitioned elements into tables vs. narrative
# text chunks. The element kind is detected from its class path string, which
# avoids importing unstructured's element classes directly.
tables = []
texts = []
for el in raw_pdf_elements:
    kind = str(type(el))
    if "unstructured.documents.elements.Table" in kind:
        tables.append(str(el))
    elif "unstructured.documents.elements.CompositeElement" in kind:
        texts.append(str(el))
|
|
|
|
|
|
|
|
# Summarize
# Each table (and optionally each text chunk) is condensed by an LLM before
# indexing, so retrieval matches against summaries rather than raw chunks.
model = ChatOpenAI(temperature=0, model="gpt-4")
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

# Apply: summarize every extracted table, up to 5 LLM calls in flight.
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

# To save time / cost, only do text summaries if chunk sizes are large
# text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
# We can just assign text_summaries to the raw texts
text_summaries = texts
|
|
|
|
|
|
|
|
# Use multi vector retriever: summaries are embedded into the vectorstore,
# while the full parent chunks live in an in-memory docstore, linked by the
# `doc_id` metadata key.
id_key = "doc_id"

# The storage layer for the parent documents.
store = InMemoryStore()

# The vectorstore that indexes the (child) summaries.
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings(),
)

# The retriever (empty to start).
retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)
|
|
|
|
|
|
|
|
# Add texts: index one summary Document per chunk, then store the raw chunks
# in the docstore under the same generated ids so retrieval can return the
# full parent text.
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))
|
|
|
|
|
|
|
|
# Add tables: same pattern as the texts — summaries go into the vectorstore,
# the raw table strings into the docstore, joined on generated ids.
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=summary, metadata={id_key: table_id})
    for table_id, summary in zip(table_ids, table_summaries)
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))
|
|
|
|
|
|
|
|
# RAG

# Prompt template: answer strictly from the retrieved context.
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""  # noqa: E501
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4")

# RAG pipeline: fan the incoming question into retrieved context + the
# question itself, fill the prompt, call the LLM, and parse to a string.
_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = _inputs | prompt | model | StrOutputParser()
|
2023-11-01 00:13:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Add typing for input
class Question(BaseModel):
    """Input schema: the chain accepts a single question string."""

    __root__: str


# Expose the typed input schema on the chain (used by LangServe/playground).
chain = chain.with_types(input_type=Question)
|