From fad26e79a31d78223f1f8ed05b1d463ea67ae5c5 Mon Sep 17 00:00:00 2001 From: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Date: Tue, 8 Aug 2023 14:11:12 -0500 Subject: [PATCH] fix: Resolve `AttributeError` in Google Cloud Enterprise Search retriever (#8872) - Reverting some of the changes made in https://github.com/langchain-ai/langchain/pull/8369 --- .../google_cloud_enterprise_search.py | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/libs/langchain/langchain/retrievers/google_cloud_enterprise_search.py b/libs/langchain/langchain/retrievers/google_cloud_enterprise_search.py index cab8c02d0b..0d76022ef7 100644 --- a/libs/langchain/langchain/retrievers/google_cloud_enterprise_search.py +++ b/libs/langchain/langchain/retrievers/google_cloud_enterprise_search.py @@ -116,22 +116,37 @@ class GoogleCloudEnterpriseSearchRetriever(BaseRetriever): self, results: Sequence[SearchResult] ) -> List[Document]: """Converts a sequence of search results to a list of LangChain documents.""" + from google.protobuf.json_format import MessageToDict + documents: List[Document] = [] for result in results: - derived_struct_data = result.document.derived_struct_data - doc_metadata = result.document.struct_data - doc_metadata.source = derived_struct_data.link or "" - doc_metadata.id = result.document.id - - for chunk in ( - derived_struct_data.extractive_answers - or derived_struct_data.extractive_segments - ): - if hasattr(chunk, "page_number"): - doc_metadata.source += f":{chunk.page_number}" + document_dict = MessageToDict( + result.document._pb, preserving_proto_field_name=True + ) + derived_struct_data = document_dict.get("derived_struct_data", None) + if not derived_struct_data: + continue + + doc_metadata = document_dict.get("struct_data", {}) + doc_metadata["id"] = document_dict["id"] + + chunk_type = ( + "extractive_answers" + if self.get_extractive_answers + else "extractive_segments" + ) + + for chunk in getattr(derived_struct_data, chunk_type, []): + doc_metadata["source"] = derived_struct_data.get("link", "") + + if chunk_type == "extractive_answers": + doc_metadata["source"] += f":{chunk.get('pageNumber', '')}" + documents.append( - Document(page_content=chunk.content, metadata=doc_metadata) + Document( + page_content=chunk.get("content", ""), metadata=doc_metadata + ) ) return documents