2023-11-03 20:22:55 +00:00
|
|
|
from pathlib import Path
|
|
|
|
from typing import List
|
|
|
|
|
2024-01-02 21:47:11 +00:00
|
|
|
from langchain_community.document_loaders import TextLoader
|
|
|
|
from langchain_community.graphs import Neo4jGraph
|
2024-01-03 21:28:05 +00:00
|
|
|
from langchain_core.prompts import ChatPromptTemplate
|
docs[patch], templates[patch]: Import from core (#14575)
Update imports to use core for the low-hanging fruit changes. Ran
following
```bash
git grep -l 'langchain.schema.runnable' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.runnable/langchain_core.runnables/g'
git grep -l 'langchain.schema.output_parser' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.output_parser/langchain_core.output_parsers/g'
git grep -l 'langchain.schema.messages' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.messages/langchain_core.messages/g'
git grep -l 'langchain.schema.chat_histry' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.chat_history/langchain_core.chat_history/g'
git grep -l 'langchain.schema.prompt_template' {docs,templates,cookbook} | xargs sed -i '' 's/langchain\.schema\.prompt_template/langchain_core.prompts/g'
git grep -l 'from langchain.pydantic_v1' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.pydantic_v1/from langchain_core.pydantic_v1/g'
git grep -l 'from langchain.tools.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.tools\.base/from langchain_core.tools/g'
git grep -l 'from langchain.chat_models.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.chat_models.base/from langchain_core.language_models.chat_models/g'
git grep -l 'from langchain.llms.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.llms\.base\ /from langchain_core.language_models.llms\ /g'
git grep -l 'from langchain.embeddings.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.embeddings\.base/from langchain_core.embeddings/g'
git grep -l 'from langchain.vectorstores.base' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.vectorstores\.base/from langchain_core.vectorstores/g'
git grep -l 'from langchain.agents.tools' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.agents\.tools/from langchain_core.tools/g'
git grep -l 'from langchain.schema.output' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.output\ /from langchain_core.outputs\ /g'
git grep -l 'from langchain.schema.embeddings' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.embeddings/from langchain_core.embeddings/g'
git grep -l 'from langchain.schema.document' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.document/from langchain_core.documents/g'
git grep -l 'from langchain.schema.agent' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.agent/from langchain_core.agents/g'
git grep -l 'from langchain.schema.prompt ' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.prompt\ /from langchain_core.prompt_values /g'
git grep -l 'from langchain.schema.language_model' {docs,templates,cookbook} | xargs sed -i '' 's/from langchain\.schema\.language_model/from langchain_core.language_models/g'
```
2023-12-12 00:49:10 +00:00
|
|
|
from langchain_core.pydantic_v1 import BaseModel, Field
|
2024-03-30 14:40:05 +00:00
|
|
|
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
2024-03-01 02:33:21 +00:00
|
|
|
from langchain_text_splitters import TokenTextSplitter
|
2023-11-03 20:22:55 +00:00
|
|
|
from neo4j.exceptions import ClientError
|
|
|
|
|
|
|
|
# Source text to ingest; it is looked up next to this script.
txt_path = Path(__file__).with_name("dune.txt")

# Neo4j handle; no connection arguments are passed here, so it presumably
# picks its settings up from the environment — TODO confirm.
graph = Neo4jGraph()

# Embeddings & LLM models
embeddings = OpenAIEmbeddings()
# Dimension handed to every vector index created further down; presumably the
# size of the vectors produced by `embeddings` — TODO confirm.
embedding_dimension = 1536
llm = ChatOpenAI(temperature=0)

# Load the text file
loader = TextLoader(str(txt_path))
documents = loader.load()

# Ingest Parent-Child node pairs
# Parents are 512-token chunks, children are 100-token chunks cut out of each
# parent; both splitters use a 24-token overlap.
parent_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
child_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=24)
parent_documents = parent_splitter.split_documents(documents)
|
|
|
|
|
|
|
|
# Cypher statement that upserts one parent chunk, stores its embedding, then
# upserts each child chunk, links it to the parent and stores its embedding.
_PARENT_CHILD_QUERY = """
    MERGE (p:Parent {id: $parent_id})
    SET p.text = $parent_text
    WITH p
    CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
    YIELD node
    WITH p
    UNWIND $children AS child
    MERGE (c:Child {id: child.id})
    SET c.text = child.text
    MERGE (c)<-[:HAS_CHILD]-(p)
    WITH c, child
    CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
    YIELD node
    RETURN count(*)
    """

for parent_id, parent_doc in enumerate(parent_documents):
    # Cut the parent chunk into its smaller child chunks.
    child_documents = child_splitter.split_documents([parent_doc])

    # The parent is embedded first, then each child in order.
    parent_embedding = embeddings.embed_query(parent_doc.page_content)
    children = [
        {
            "text": child_doc.page_content,
            # Child ids are "<parent index>-<child index>".
            "id": f"{parent_id}-{child_idx}",
            "embedding": embeddings.embed_query(child_doc.page_content),
        }
        for child_idx, child_doc in enumerate(child_documents)
    ]

    params = {
        "parent_text": parent_doc.page_content,
        "parent_id": parent_id,
        "parent_embedding": parent_embedding,
        "children": children,
    }

    # Ingest data
    graph.query(_PARENT_CHILD_QUERY, params)
|
|
|
|
# Create the vector indexes for children ('parent_document') and for
# parents ('typical_rag'). Both use the 'embedding' property and cosine
# similarity.
for create_index_query in (
    (
        "CALL db.index.vector.createNodeIndex('parent_document', "
        "'Child', 'embedding', $dimension, 'cosine')"
    ),
    (
        "CALL db.index.vector.createNodeIndex('typical_rag', "
        "'Parent', 'embedding', $dimension, 'cosine')"
    ),
):
    try:
        graph.query(create_index_query, {"dimension": embedding_dimension})
    except ClientError:  # already exists
        # Any ClientError is treated as "index already exists" and ignored.
        pass
|
|
|
|
# Ingest hypothetical questions
|
|
|
|
|
|
|
|
|
|
|
|
class Questions(BaseModel):
    """Generating hypothetical questions about text."""

    # NOTE(review): this model is passed to `llm.with_structured_output(...)`
    # further down, so the docstring above and the description below are
    # presumably sent to the model as part of the output schema — keep their
    # wording unchanged unless a prompt change is intended.
    questions: List[str] = Field(
        ...,  # required field, no default
        description=(
            "Generated hypothetical questions "
            "based on the information from the text"
        ),
    )
|
|
|
|
|
|
|
|
|
|
|
|
# System message: describes the question-generation task.
_questions_system_message = (
    "You are generating hypothetical questions based on the information "
    "found in the text. Make sure to provide full context in the generated "
    "questions."
)
# Human message: carries the text to process in the `input` variable.
_questions_human_message = (
    "Use the given format to generate hypothetical questions from the "
    "following input: {input}"
)

questions_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _questions_system_message),
        ("human", _questions_human_message),
    ]
)

# Prompt piped into the LLM, with the output structured as `Questions`.
question_chain = questions_prompt | llm.with_structured_output(Questions)
|
2023-11-03 20:22:55 +00:00
|
|
|
|
|
|
|
# For every parent chunk: ask the LLM for hypothetical questions, embed each
# non-empty question and attach it to the parent as a Question node.
for i, parent in enumerate(parent_documents):
    # The prompt variable is passed by name, the same way the summary chain
    # in this file is invoked.
    questions = question_chain.invoke({"input": parent.page_content}).questions
    params = {
        "parent_id": i,
        "questions": [
            # Question ids are "<parent index>-<question index>".
            {"text": q, "id": f"{i}-{iq}", "embedding": embeddings.embed_query(q)}
            for iq, q in enumerate(questions)
            if q  # skip empty question strings
        ],
    }
    # MERGE (not CREATE) on the Question id keeps the script re-runnable:
    # a second run updates the existing nodes instead of adding duplicates,
    # matching how Parent and Child nodes are written.
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    WITH p
    UNWIND $questions AS question
    MERGE (q:Question {id: question.id})
    SET q.text = question.text
    MERGE (q)<-[:HAS_QUESTION]-(p)
    WITH q, question
    CALL db.create.setVectorProperty(q, 'embedding', question.embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )
|
|
|
|
# Create the vector index over Question embeddings (cosine similarity).
_question_index_query = (
    "CALL db.index.vector.createNodeIndex('hypothetical_questions', "
    "'Question', 'embedding', $dimension, 'cosine')"
)
try:
    graph.query(_question_index_query, {"dimension": embedding_dimension})
except ClientError:  # already exists
    # Any ClientError is treated as "index already exists" and ignored.
    pass
|
|
|
|
|
|
|
|
# Ingest summaries
|
|
|
|
|
|
|
|
# System message: describes the summarisation task.
_summary_system_message = (
    "You are generating concise and accurate summaries based on the "
    "information found in the text."
)
# Human message: the text to summarise is supplied in the `question` variable.
_summary_human_message = (
    "Generate a summary of the following input: {question}\n"
    "Summary:"
)

summary_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _summary_system_message),
        ("human", _summary_human_message),
    ]
)

# Prompt piped straight into the chat model.
summary_chain = summary_prompt | llm
|
|
|
|
|
|
|
|
# Cypher statement that attaches a Summary node to a parent, sets its text
# and stores its embedding.
_SUMMARY_QUERY = """
    MERGE (p:Parent {id: $parent_id})
    MERGE (p)-[:HAS_SUMMARY]->(s:Summary)
    SET s.text = $summary
    WITH s
    CALL db.create.setVectorProperty(s, 'embedding', $embedding)
    YIELD node
    RETURN count(*)
    """

for parent_id, parent_doc in enumerate(parent_documents):
    # Ask the LLM for a summary of the parent chunk and keep its text content.
    summary_message = summary_chain.invoke({"question": parent_doc.page_content})
    summary = summary_message.content

    params = {
        "parent_id": parent_id,
        "summary": summary,
        "embedding": embeddings.embed_query(summary),
    }
    graph.query(_SUMMARY_QUERY, params)
|
|
|
|
# Create the vector index over Summary embeddings (cosine similarity).
_summary_index_query = (
    "CALL db.index.vector.createNodeIndex('summary', "
    "'Summary', 'embedding', $dimension, 'cosine')"
)
try:
    graph.query(_summary_index_query, {"dimension": embedding_dimension})
except ClientError:  # already exists
    # Any ClientError is treated as "index already exists" and ignored.
    pass
|