diff --git a/templates/rag-semi-structured/pyproject.toml b/templates/rag-semi-structured/pyproject.toml
index 78c6c74829..de10e6e57e 100644
--- a/templates/rag-semi-structured/pyproject.toml
+++ b/templates/rag-semi-structured/pyproject.toml
@@ -8,7 +8,7 @@ authors = [
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = ">=3.8.1,<4.0"
+python = ">=3.9,<3.11"
 langchain = ">=0.0.325"
 tiktoken = ">=0.5.1"
 chromadb = ">=0.4.14"
@@ -16,6 +16,12 @@ openai = "<2"
 unstructured = ">=0.10.19"
 pdf2image = ">=1.16.3"
 pdfminer = "^20191125"
+opencv-python = "^4.8.1.78"
+pandas = "^2.1.4"
+pytesseract = "^0.3.10"
+pdfminer-six = "^20221105"
+unstructured-pytesseract = "^0.3.12"
+unstructured-inference = "^0.7.18"
 
 [tool.poetry.group.dev.dependencies]
 langchain-cli = ">=0.0.15"
diff --git a/templates/rag-semi-structured/rag_semi_structured/chain.py b/templates/rag-semi-structured/rag_semi_structured/chain.py
index 488a753c88..8c74db3d07 100644
--- a/templates/rag-semi-structured/rag_semi_structured/chain.py
+++ b/templates/rag-semi-structured/rag_semi_structured/chain.py
@@ -16,7 +16,7 @@ from unstructured.partition.pdf import partition_pdf
 # Path to docs
 path = "docs"
 raw_pdf_elements = partition_pdf(
-    filename=path + "LLaMA2.pdf",
+    filename=path + "/LLaVA.pdf",
     # Unstructured first finds embedded image blocks
     extract_images_in_pdf=False,
     # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles