import json
import logging
import numbers
from hashlib import sha1
from typing import Any, Dict, Iterable, List, Optional, Tuple

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

logger = logging.getLogger()


class AlibabaCloudOpenSearchSettings:
    """`Alibaba Cloud OpenSearch` client configuration.

    Attributes:
        endpoint (str): The endpoint of the OpenSearch instance. You can find it
            in the Alibaba Cloud OpenSearch console.
        instance_id (str): The identifier of the OpenSearch instance. You can
            find it in the Alibaba Cloud OpenSearch console.
        username (str): The username specified when purchasing the instance.
        password (str): The password specified when purchasing the instance.
            After the instance is created, you can change it in the console.
        table_name (str): The table name specified during instance configuration.
        field_name_mapping (Dict): A mapping from the field names used by this
            vector store to the field names configured in the OpenSearch
            instance table:
            {
                'id': 'The id field name of the indexed document.',
                'document': 'The text field name of the indexed document.',
                'embedding': 'The embedding field of the OpenSearch instance;
                    values must be floats separated by the configured
                    separator, a comma by default.',
                'metadata_field_x': 'A metadata field mapping consisting of
                    the mapped field name and a filter operator, separated
                    by a comma.',
            }
        protocol (str): Communication protocol between the SDK and the server,
            "http" by default.
        namespace (str): The instance data is partitioned by the "namespace"
            field. If namespaces are enabled, you must specify the namespace
            field name during initialization; otherwise queries cannot be
            executed correctly.
        embedding_field_separator (str): Delimiter used when writing vector
            field data, a comma by default.
        output_fields: The list of fields returned when invoking OpenSearch;
            defaults to the mapped field names from field_name_mapping.
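
    Example (a minimal sketch; the endpoint, credentials, and field names
    below are placeholders, not real values):
        .. code-block:: python

            settings = AlibabaCloudOpenSearchSettings(
                endpoint="ha-cn-***.public.ha.aliyuncs.com",
                instance_id="ha-cn-***",
                username="your-username",
                password="your-password",
                table_name="your-table",
                field_name_mapping={
                    "id": "id",
                    "document": "document",
                    "embedding": "embedding",
                    "source": "source,=",
                },
            )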
    """

    def __init__(
        self,
        endpoint: str,
        instance_id: str,
        username: str,
        password: str,
        table_name: str,
        field_name_mapping: Dict[str, str],
        protocol: str = "http",
        namespace: str = "",
        embedding_field_separator: str = ",",
        output_fields: Optional[List[str]] = None,
    ) -> None:
        self.endpoint = endpoint
        self.instance_id = instance_id
        self.protocol = protocol
        self.username = username
        self.password = password
        self.namespace = namespace
        self.table_name = table_name
        self.opt_table_name = "_".join([self.instance_id, self.table_name])
        self.field_name_mapping = field_name_mapping
        self.embedding_field_separator = embedding_field_separator
        # Keep explicitly requested output fields; otherwise default to the
        # mapped field names from field_name_mapping.
        self.output_fields = output_fields
        if self.output_fields is None:
            self.output_fields = [
                field.split(",")[0] for field in self.field_name_mapping.values()
            ]
        self.inverse_field_name_mapping: Dict[str, str] = {}
        for key, value in self.field_name_mapping.items():
            self.inverse_field_name_mapping[value.split(",")[0]] = key

    def __getitem__(self, item: str) -> Any:
        return getattr(self, item)


def create_metadata(fields: Dict[str, Any]) -> Dict[str, Any]:
    """Create metadata from fields.

    Args:
        fields: The fields of the document. The fields must be a dict.

    Returns:
        metadata: The metadata of the document. The metadata must be a dict.
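
    Example (a doctest-style illustration; the field names follow the default
    mapping used elsewhere in this module):
        >>> create_metadata(
        ...     {"id": "1", "document": "text", "embedding": "0.1,0.2", "source": "a"}
        ... )
        {'source': 'a'}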
    """
    metadata: Dict[str, Any] = {}
    for key, value in fields.items():
        # The id, document, and embedding fields are stored separately,
        # so they are excluded from the metadata dict.
        if key == "id" or key == "document" or key == "embedding":
            continue
        metadata[key] = value
    return metadata


class AlibabaCloudOpenSearch(VectorStore):
    """`Alibaba Cloud OpenSearch` vector store.
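
    Example (a minimal, hypothetical sketch; ``settings`` is an
    ``AlibabaCloudOpenSearchSettings`` like the one shown above, and any
    ``Embeddings`` implementation can stand in for ``FakeEmbeddings``):
        .. code-block:: python

            from langchain_community.embeddings.fake import FakeEmbeddings

            embeddings = FakeEmbeddings(size=4)
            store = AlibabaCloudOpenSearch(
                embedding=embeddings,
                config=settings,
            )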
    """

    def __init__(
        self,
        embedding: Embeddings,
        config: AlibabaCloudOpenSearchSettings,
        **kwargs: Any,
    ) -> None:
        # The SDK is imported lazily so that it stays an optional dependency.
        try:
            from alibabacloud_ha3engine_vector import client, models
            from alibabacloud_tea_util import models as util_models
        except ImportError:
            raise ImportError(
                "Could not import alibaba cloud opensearch python package. "
                "Please install it with `pip install alibabacloud-ha3engine-vector`."
            )

        self.config = config
        self.embedding = embedding

        # HTTP client runtime options for the OpenSearch SDK.
        self.runtime = util_models.RuntimeOptions(
            connect_timeout=5000,
            read_timeout=10000,
            autoretry=False,
            ignore_ssl=False,
            max_idle_conns=50,
        )
        self.ha3_engine_client = client.Client(
            models.Config(
                endpoint=config.endpoint,
                instance_id=config.instance_id,
                protocol=config.protocol,
                access_user_name=config.username,
                access_pass_word=config.password,
            )
        )

        self.options_headers: Dict[str, str] = {}

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Insert documents into the instance.

        Args:
            texts: The text segments to be inserted into the vector storage;
                should not be empty.
            metadatas: Metadata information.

        Returns:
            id_list: List of document IDs.
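
        Example (illustrative; assumes ``store`` was created as shown above):
            .. code-block:: python

                ids = store.add_texts(
                    ["hello world", "goodbye world"],
                    metadatas=[{"source": "a"}, {"source": "b"}],
                )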
        """

        from alibabacloud_ha3engine_vector import models

        def _upsert(push_doc_list: List[Dict]) -> List[str]:
            if push_doc_list is None or len(push_doc_list) == 0:
                return []
            try:
                push_request = models.PushDocumentsRequest(
                    self.options_headers, push_doc_list
                )
                push_response = self.ha3_engine_client.push_documents(
                    self.config.opt_table_name, field_name_map["id"], push_request
                )
                json_response = json.loads(push_response.body)
                if json_response["status"] == "OK":
                    return [
                        push_doc["fields"][field_name_map["id"]]
                        for push_doc in push_doc_list
                    ]
                return []
            except Exception as e:
                logger.error(
                    f"add doc to endpoint:{self.config.endpoint} "
                    f"instance_id:{self.config.instance_id} failed.",
                    e,
                )
                raise e

        # Document IDs are derived from the SHA-1 of the text, so adding the
        # same text twice overwrites the existing document.
        id_list = [sha1(t.encode("utf-8")).hexdigest() for t in texts]
        embeddings = self.embedding.embed_documents(list(texts))
        metadatas = metadatas or [{} for _ in texts]
        field_name_map = self.config.field_name_mapping
        add_doc_list = []
        text_list = list(texts)
        for idx, doc_id in enumerate(id_list):
            embedding = embeddings[idx] if idx < len(embeddings) else None
            metadata = metadatas[idx] if idx < len(metadatas) else None
            text = text_list[idx] if idx < len(text_list) else None
            add_doc: Dict[str, Any] = {}
            add_doc_fields: Dict[str, Any] = {}
            add_doc_fields[field_name_map["id"]] = doc_id
            add_doc_fields[field_name_map["document"]] = text
            if embedding is not None:
                # Vectors are serialized as separator-joined strings.
                add_doc_fields[field_name_map["embedding"]] = (
                    self.config.embedding_field_separator.join(
                        str(unit) for unit in embedding
                    )
                )
            if metadata is not None:
                for md_key, md_value in metadata.items():
                    # A metadata mapping may carry a filter operator after a
                    # comma; only the field name is used when writing.
                    add_doc_fields[field_name_map[md_key].split(",")[0]] = md_value
            add_doc["fields"] = add_doc_fields
            add_doc["cmd"] = "add"
            add_doc_list.append(add_doc)
        return _upsert(add_doc_list)

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        search_filter: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform similarity retrieval based on text.

        Args:
            query: The query text to embed and search for; should not be empty.
            k: The number of top results to return.
            search_filter: Additional filtering conditions.

        Returns:
            document_list: List of documents.
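
        Example (illustrative; a filter key must appear in
        ``field_name_mapping`` together with an operator, e.g. ``"source,="``):
            .. code-block:: python

                docs = store.similarity_search(
                    "hello", k=2, search_filter={"source": "a"}
                )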
        """
        embedding = self.embedding.embed_query(query)
        return self.create_results(
            self.inner_embedding_query(
                embedding=embedding, search_filter=search_filter, k=k
            )
        )

    def similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        search_filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Perform similarity retrieval based on text, with relevance scores.

        Args:
            query: The query text to embed and search for; should not be empty.
            k: The number of top results to return.
            search_filter: Additional filtering conditions.

        Returns:
            document_list: List of (document, score) tuples.
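
        Example (illustrative):
            .. code-block:: python

                for doc, score in store.similarity_search_with_relevance_scores(
                    "hello", k=2
                ):
                    print(doc.page_content, score)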
        """
        embedding: List[float] = self.embedding.embed_query(query)
        return self.create_results_with_score(
            self.inner_embedding_query(
                embedding=embedding, search_filter=search_filter, k=k
            )
        )

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        search_filter: Optional[dict] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform retrieval directly using a vector.

        Args:
            embedding: The query vector.
            k: The number of top results to return.
            search_filter: Additional filtering conditions.

        Returns:
            document_list: List of documents.
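
        Example (illustrative; the vector length must match the embedding
        dimension configured for the instance):
            .. code-block:: python

                docs = store.similarity_search_by_vector([0.1, 0.2, 0.3, 0.4])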
        """
        return self.create_results(
            self.inner_embedding_query(
                embedding=embedding, search_filter=search_filter, k=k
            )
        )

    def inner_embedding_query(
        self,
        embedding: List[float],
        search_filter: Optional[Dict[str, Any]] = None,
        k: int = 4,
    ) -> Dict[str, Any]:
        from alibabacloud_ha3engine_vector.models import QueryRequest

        def generate_filter_query() -> str:
            if search_filter is None:
                return ""
            filter_clause = " AND ".join(
                [
                    create_filter(md_key, md_value)
                    for md_key, md_value in search_filter.items()
                ]
            )
            return filter_clause

        def create_filter(md_key: str, md_value: Any) -> str:
            # A metadata mapping value has the form "<field name>,<operator>":
            # "price,>=" with the value 10 yields 'price >= 10', and
            # "source,=" with the value "a" yields 'source="a"'.
            md_filter_expr = self.config.field_name_mapping[md_key]
            if md_filter_expr is None:
                return ""
            expr = md_filter_expr.split(",")
            if len(expr) != 2:
                logger.error(
                    f"filter {md_filter_expr} expression is not correct; "
                    f"it must contain a mapped field name and an operator."
                )
                return ""
            md_filter_key = expr[0].strip()
            md_filter_operator = expr[1].strip()
            if isinstance(md_value, numbers.Number):
                return f"{md_filter_key} {md_filter_operator} {md_value}"
            return f'{md_filter_key}{md_filter_operator}"{md_value}"'

        def search_data() -> Dict[str, Any]:
            request = QueryRequest(
                table_name=self.config.table_name,
                namespace=self.config.namespace,
                vector=embedding,
                include_vector=True,
                output_fields=self.config.output_fields,
                filter=generate_filter_query(),
                top_k=k,
            )

            query_result = self.ha3_engine_client.query(request)
            return json.loads(query_result.body)

        try:
            json_response = search_data()
            if (
                "errorCode" in json_response
                and "errorMsg" in json_response
                and len(json_response["errorMsg"]) > 0
            ):
                logger.error(
                    f"query {self.config.endpoint} {self.config.instance_id} "
                    f"failed:{json_response['errorMsg']}."
                )
            else:
                return json_response
        except Exception as e:
            logger.error(
                f"query instance endpoint:{self.config.endpoint} "
                f"instance_id:{self.config.instance_id} failed.",
                e,
            )
        return {}

    def create_results(self, json_result: Dict[str, Any]) -> List[Document]:
        """Assemble documents from the raw query response."""
        items = json_result["result"]
        query_result_list: List[Document] = []
        for item in items:
            if (
                "fields" not in item
                or self.config.field_name_mapping["document"] not in item["fields"]
            ):
                # The document field is missing; emit an empty placeholder,
                # since Document requires page_content.
                query_result_list.append(Document(page_content=""))
            else:
                fields = item["fields"]
                query_result_list.append(
                    Document(
                        page_content=fields[self.config.field_name_mapping["document"]],
                        metadata=self.create_inverse_metadata(fields),
                    )
                )
        return query_result_list

    def create_inverse_metadata(self, fields: Dict[str, Any]) -> Dict[str, Any]:
        """Create metadata from fields, mapping instance field names back to
        the original metadata keys.

        Args:
            fields: The fields of the document. The fields must be a dict.

        Returns:
            metadata: The metadata of the document. The metadata must be a dict.
        """
        metadata: Dict[str, Any] = {}
        for key, value in fields.items():
            if key == "id" or key == "document" or key == "embedding":
                continue
            metadata[self.config.inverse_field_name_mapping[key]] = value
        return metadata

    def create_results_with_score(
        self, json_result: Dict[str, Any]
    ) -> List[Tuple[Document, float]]:
        """Parse the returned results along with their scores.

        Args:
            json_result: Results from the OpenSearch query.

        Returns:
            query_result_list: Results with scores.
        """
        items = json_result["result"]
        query_result_list: List[Tuple[Document, float]] = []
        for item in items:
            fields = item["fields"]
            query_result_list.append(
                (
                    Document(
                        page_content=fields[self.config.field_name_mapping["document"]],
                        metadata=self.create_inverse_metadata(fields),
                    ),
                    float(item["score"]),
                )
            )
        return query_result_list

    def delete_documents_with_texts(self, texts: List[str]) -> bool:
        """Delete documents based on their page content.

        Args:
            texts: List of document page content.

        Returns:
            Whether the deletion was successful or not.
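
        Example (illustrative):
            .. code-block:: python

                store.delete_documents_with_texts(["hello world"])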
        """
        # IDs are the SHA-1 of the page content, matching how add_texts
        # generates them.
        id_list = [sha1(t.encode("utf-8")).hexdigest() for t in texts]
        return self.delete_documents_with_document_id(id_list)

    def delete_documents_with_document_id(self, id_list: List[str]) -> bool:
        """Delete documents based on their IDs.

        Args:
            id_list: List of document IDs.

        Returns:
            Whether the deletion was successful or not.
        """
        if id_list is None or len(id_list) == 0:
            return True

        from alibabacloud_ha3engine_vector import models

        delete_doc_list = []
        for doc_id in id_list:
            delete_doc_list.append(
                {
                    "fields": {self.config.field_name_mapping["id"]: doc_id},
                    "cmd": "delete",
                }
            )

        delete_request = models.PushDocumentsRequest(
            self.options_headers, delete_doc_list
        )
        try:
            delete_response = self.ha3_engine_client.push_documents(
                self.config.opt_table_name,
                self.config.field_name_mapping["id"],
                delete_request,
            )
            json_response = json.loads(delete_response.body)
            return json_response["status"] == "OK"
        except Exception as e:
            logger.error(
                f"delete doc from:{self.config.endpoint} "
                f"instance_id:{self.config.instance_id} failed.",
                e,
            )
            raise e

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        config: Optional[AlibabaCloudOpenSearchSettings] = None,
        **kwargs: Any,
    ) -> "AlibabaCloudOpenSearch":
        """Create an Alibaba Cloud OpenSearch vector store instance from texts.

        Args:
            texts: The text segments to be inserted into the vector storage;
                should not be empty.
            embedding: Embedding function.
            config: Alibaba OpenSearch instance configuration.
            metadatas: Metadata information.

        Returns:
            AlibabaCloudOpenSearch: Alibaba Cloud OpenSearch vector store instance.
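
        Example (illustrative; ``settings`` and ``embeddings`` as above):
            .. code-block:: python

                store = AlibabaCloudOpenSearch.from_texts(
                    texts=["hello world"],
                    embedding=embeddings,
                    config=settings,
                )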
        """
        if texts is None or len(texts) == 0:
            raise Exception("the inserted text segments should not be empty.")

        if embedding is None:
            raise Exception("the embeddings should not be empty.")

        if config is None:
            raise Exception("config should not be none.")

        ctx = cls(embedding, config, **kwargs)
        ctx.add_texts(texts=texts, metadatas=metadatas)
        return ctx

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        config: Optional[AlibabaCloudOpenSearchSettings] = None,
        **kwargs: Any,
    ) -> "AlibabaCloudOpenSearch":
        """Create an Alibaba Cloud OpenSearch vector store instance from documents.

        Document IDs are generated automatically based on the text content.

        Args:
            documents: Documents to be inserted into the vector storage;
                should not be empty.
            embedding: Embedding function.
            config: Alibaba OpenSearch instance configuration.

        Returns:
            AlibabaCloudOpenSearch: Alibaba Cloud OpenSearch vector store instance.
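
        Example (illustrative):
            .. code-block:: python

                from langchain_core.documents import Document

                store = AlibabaCloudOpenSearch.from_documents(
                    documents=[Document(page_content="hello world")],
                    embedding=embeddings,
                    config=settings,
                )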
        """
        if documents is None or len(documents) == 0:
            raise Exception("the inserted documents should not be empty.")

        if embedding is None:
            raise Exception("the embeddings should not be empty.")

        if config is None:
            raise Exception("config should not be none.")

        texts = [d.page_content for d in documents]
        metadatas = [d.metadata for d in documents]
        return cls.from_texts(
            texts=texts,
            embedding=embedding,
            metadatas=metadatas,
            config=config,
            **kwargs,
        )