2023-10-28 15:31:12 +00:00
|
|
|
import os
|
|
|
|
|
2024-01-02 21:47:11 +00:00
|
|
|
from langchain_community.document_loaders import UnstructuredFileLoader
|
2024-01-02 20:32:16 +00:00
|
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings
|
2024-01-02 21:47:11 +00:00
|
|
|
from langchain_community.vectorstores import Redis
|
2024-03-01 02:33:21 +00:00
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
2023-10-28 15:31:12 +00:00
|
|
|
from rag_redis.config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL
|
|
|
|
|
|
|
|
|
|
|
|
def ingest_documents():
    """
    Ingest PDF to Redis from the data/ directory that
    contains Edgar 10k filings data for Nike.

    The first entry of ``data/`` (in sorted name order) is loaded with
    ``UnstructuredFileLoader``, split into overlapping chunks, and handed to
    ``Redis.from_texts`` together with a HuggingFace embedder built from
    ``EMBED_MODEL``. The target index is taken from ``INDEX_NAME`` /
    ``INDEX_SCHEMA`` and the server from ``REDIS_URL``.

    Raises:
        IndexError: If the data/ directory contains no entries.
    """
    # Load list of pdfs
    company_name = "Nike"
    data_path = "data/"
    # os.listdir returns entries in arbitrary order; sort so the same document
    # is selected on every run and on every platform.
    docs = sorted(
        os.path.join(data_path, name) for name in os.listdir(data_path)
    )
    if not docs:
        # Same exception type the bare ``[0]`` lookup used to raise, but with
        # a message that names the actual problem.
        raise IndexError(f"No documents found in {data_path!r}; nothing to ingest")
    doc = docs[0]

    print("Parsing 10k filing doc for NIKE", doc)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=100, add_start_index=True
    )
    loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")
    chunks = loader.load_and_split(text_splitter)

    print("Done preprocessing. Created", len(chunks), "chunks of the original pdf")

    # Create vectorstore
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    # The returned vectorstore handle is not needed here; the call is made
    # for its effect of loading the texts into Redis.
    _ = Redis.from_texts(
        # appending this little bit can sometimes help with semantic retrieval
        # especially with multiple companies
        texts=[f"Company: {company_name}. " + chunk.page_content for chunk in chunks],
        metadatas=[chunk.metadata for chunk in chunks],
        embedding=embedder,
        index_name=INDEX_NAME,
        index_schema=INDEX_SCHEMA,
        redis_url=REDIS_URL,
    )
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    ingest_documents()
|