From a4992ffadaf8a62c0a9f174de71b5f1a624a90cc Mon Sep 17 00:00:00 2001 From: Shaurya Rohatgi Date: Tue, 12 Dec 2023 18:44:35 -0500 Subject: [PATCH] fix: to rag-semi-structured template (#14568) **Description:** Fixes to rag-semi-structured template. - Added required libraries - pdfminer was causing issues when installing with pip. pdfminer.six works best - Changed the pdf name for demo from llama2 to llava --- templates/rag-semi-structured/pyproject.toml | 8 +++++++- .../rag-semi-structured/rag_semi_structured/chain.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/templates/rag-semi-structured/pyproject.toml b/templates/rag-semi-structured/pyproject.toml index 78c6c74829..de10e6e57e 100644 --- a/templates/rag-semi-structured/pyproject.toml +++ b/templates/rag-semi-structured/pyproject.toml @@ -8,7 +8,7 @@ authors = [ readme = "README.md" [tool.poetry.dependencies] -python = ">=3.8.1,<4.0" +python = ">=3.9,<3.11" langchain = ">=0.0.325" tiktoken = ">=0.5.1" chromadb = ">=0.4.14" @@ -16,6 +16,12 @@ openai = "<2" unstructured = ">=0.10.19" pdf2image = ">=1.16.3" pdfminer = "^20191125" +opencv-python = "^4.8.1.78" +pandas = "^2.1.4" +pytesseract = "^0.3.10" +pdfminer-six = "^20221105" +unstructured-pytesseract = "^0.3.12" +unstructured-inference = "^0.7.18" [tool.poetry.group.dev.dependencies] langchain-cli = ">=0.0.15" diff --git a/templates/rag-semi-structured/rag_semi_structured/chain.py b/templates/rag-semi-structured/rag_semi_structured/chain.py index 488a753c88..8c74db3d07 100644 --- a/templates/rag-semi-structured/rag_semi_structured/chain.py +++ b/templates/rag-semi-structured/rag_semi_structured/chain.py @@ -16,7 +16,7 @@ from unstructured.partition.pdf import partition_pdf # Path to docs path = "docs" raw_pdf_elements = partition_pdf( - filename=path + "LLaMA2.pdf", + filename=path + "/LLaVA.pdf", # Unstructured first finds embedded image blocks extract_images_in_pdf=False, # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles