refactor: Code refactoring & simplification for Google Cloud Enterprise Search retriever (#8369)

Followup to https://github.com/langchain-ai/langchain/pull/7857

- Changes `_convert_search_response()` to read object attributes directly instead
of first converting each search result to a dictionary
- Simplifies logic for readability
This commit is contained in:
Holt Skinner 2023-07-27 19:13:49 -05:00 committed by GitHub
parent 594f195e54
commit d7e6770de8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -106,34 +106,23 @@ class GoogleCloudEnterpriseSearchRetriever(BaseRetriever):
self, results: Sequence[SearchResult] self, results: Sequence[SearchResult]
) -> List[Document]: ) -> List[Document]:
"""Converts a sequence of search results to a list of LangChain documents.""" """Converts a sequence of search results to a list of LangChain documents."""
from google.protobuf.json_format import MessageToDict documents: List[Document] = []
documents = []
for result in results: for result in results:
document_dict = MessageToDict(result.document._pb) derived_struct_data = result.document.derived_struct_data
derived_struct_data = document_dict.get("derivedStructData", None) doc_metadata = result.document.struct_data
if derived_struct_data: doc_metadata.source = derived_struct_data.link or ""
doc_metadata = document_dict.get("structData", {}) doc_metadata.id = result.document.id
chunk_type = (
"extractive_answers" for chunk in (
if self.get_extractive_answers derived_struct_data.extractive_answers
else "extractive_segments" or derived_struct_data.extractive_segments
):
if hasattr(chunk, "page_number"):
doc_metadata.source += f":{chunk.page_number}"
documents.append(
Document(page_content=chunk.content, metadata=doc_metadata)
) )
for chunk in derived_struct_data.get(chunk_type, []):
if chunk_type == "extractive_answers":
doc_metadata["source"] = (
f"{derived_struct_data.get('link', '')}"
f":{chunk.get('pageNumber', '')}"
)
else:
doc_metadata[
"source"
] = f"{derived_struct_data.get('link', '')}"
doc_metadata["id"] = document_dict["id"]
document = Document(
page_content=chunk.get("content", ""), metadata=doc_metadata
)
documents.append(document)
return documents return documents
@ -162,7 +151,7 @@ class GoogleCloudEnterpriseSearchRetriever(BaseRetriever):
extractive_content_spec=extractive_content_spec, extractive_content_spec=extractive_content_spec,
) )
request = SearchRequest( return SearchRequest(
query=query, query=query,
filter=self.filter, filter=self.filter,
serving_config=self._serving_config, serving_config=self._serving_config,
@ -171,8 +160,6 @@ class GoogleCloudEnterpriseSearchRetriever(BaseRetriever):
query_expansion_spec=query_expansion_spec, query_expansion_spec=query_expansion_spec,
) )
return request
def _get_relevant_documents( def _get_relevant_documents(
self, query: str, *, run_manager: CallbackManagerForRetrieverRun self, query: str, *, run_manager: CallbackManagerForRetrieverRun
) -> List[Document]: ) -> List[Document]: