feat: Vertex AI Search - Add Snippet Retrieval for Non-Advanced Website Data Stores (#13020)

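Website data stores without Advanced Website Indexing can now return snippets; extractive answers remain available when `get_extractive_answers` is enabled.

Reference: https://cloud.google.com/generative-ai-app-builder/docs/snippets#snippets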

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
pull/8568/head
Holt Skinner 11 months ago committed by GitHub
parent 3dbaaf59b2
commit 0fc8fd12bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -36,7 +36,7 @@ class _BaseGoogleVertexAISearchRetriever(BaseModel):
""" Defines the Vertex AI Search data type """ Defines the Vertex AI Search data type
0 - Unstructured data 0 - Unstructured data
1 - Structured data 1 - Structured data
2 - Website data (with Advanced Website Indexing) 2 - Website data
""" """
@root_validator(pre=True) @root_validator(pre=True)
@ -154,7 +154,7 @@ class _BaseGoogleVertexAISearchRetriever(BaseModel):
return documents return documents
def _convert_website_search_response( def _convert_website_search_response(
self, results: Sequence[SearchResult] self, results: Sequence[SearchResult], chunk_type: str
) -> List[Document]: ) -> List[Document]:
"""Converts a sequence of search results to a list of LangChain documents.""" """Converts a sequence of search results to a list of LangChain documents."""
from google.protobuf.json_format import MessageToDict from google.protobuf.json_format import MessageToDict
@ -173,22 +173,24 @@ class _BaseGoogleVertexAISearchRetriever(BaseModel):
doc_metadata["id"] = document_dict["id"] doc_metadata["id"] = document_dict["id"]
doc_metadata["source"] = derived_struct_data.get("link", "") doc_metadata["source"] = derived_struct_data.get("link", "")
chunk_type = "extractive_answers"
if chunk_type not in derived_struct_data: if chunk_type not in derived_struct_data:
continue continue
text_field = "snippet" if chunk_type == "snippets" else "content"
for chunk in derived_struct_data[chunk_type]: for chunk in derived_struct_data[chunk_type]:
documents.append( documents.append(
Document( Document(
page_content=chunk.get("content", ""), metadata=doc_metadata page_content=chunk.get(text_field, ""), metadata=doc_metadata
) )
) )
if not documents: if not documents:
print(f"No {chunk_type} could be found.")
if chunk_type == "extractive_answers":
print( print(
f"No {chunk_type} could be found.\n" "Make sure that your data store is using Advanced Website "
"Make sure that your data store is using Advanced Website Indexing.\n" "Indexing.\n"
"https://cloud.google.com/generative-ai-app-builder/docs/about-advanced-features#advanced-website-indexing" # noqa: E501 "https://cloud.google.com/generative-ai-app-builder/docs/about-advanced-features#advanced-website-indexing" # noqa: E501
) )
@ -206,7 +208,7 @@ class GoogleVertexAISearchRetriever(BaseRetriever, _BaseGoogleVertexAISearchRetr
filter: Optional[str] = None filter: Optional[str] = None
"""Filter expression.""" """Filter expression."""
get_extractive_answers: bool = False get_extractive_answers: bool = False
"""If True return Extractive Answers, otherwise return Extractive Segments.""" """If True return Extractive Answers, otherwise return Extractive Segments or Snippets.""" # noqa: E501
max_documents: int = Field(default=5, ge=1, le=100) max_documents: int = Field(default=5, ge=1, le=100)
"""The maximum number of documents to return.""" """The maximum number of documents to return."""
max_extractive_answer_count: int = Field(default=1, ge=1, le=5) max_extractive_answer_count: int = Field(default=1, ge=1, le=5)
@ -307,12 +309,15 @@ class GoogleVertexAISearchRetriever(BaseRetriever, _BaseGoogleVertexAISearchRetr
content_search_spec = SearchRequest.ContentSearchSpec( content_search_spec = SearchRequest.ContentSearchSpec(
extractive_content_spec=SearchRequest.ContentSearchSpec.ExtractiveContentSpec( extractive_content_spec=SearchRequest.ContentSearchSpec.ExtractiveContentSpec(
max_extractive_answer_count=self.max_extractive_answer_count, max_extractive_answer_count=self.max_extractive_answer_count,
) ),
snippet_spec=SearchRequest.ContentSearchSpec.SnippetSpec(
return_snippet=True
),
) )
else: else:
raise NotImplementedError( raise NotImplementedError(
"Only data store type 0 (Unstructured), 1 (Structured)," "Only data store type 0 (Unstructured), 1 (Structured),"
"or 2 (Website with Advanced Indexing) are supported currently." "or 2 (Website) are supported currently."
+ f" Got {self.engine_data_type}" + f" Got {self.engine_data_type}"
) )
@ -354,11 +359,16 @@ class GoogleVertexAISearchRetriever(BaseRetriever, _BaseGoogleVertexAISearchRetr
elif self.engine_data_type == 1: elif self.engine_data_type == 1:
documents = self._convert_structured_search_response(response.results) documents = self._convert_structured_search_response(response.results)
elif self.engine_data_type == 2: elif self.engine_data_type == 2:
documents = self._convert_website_search_response(response.results) chunk_type = (
"extractive_answers" if self.get_extractive_answers else "snippets"
)
documents = self._convert_website_search_response(
response.results, chunk_type
)
else: else:
raise NotImplementedError( raise NotImplementedError(
"Only data store type 0 (Unstructured), 1 (Structured)," "Only data store type 0 (Unstructured), 1 (Structured),"
"or 2 (Website with Advanced Indexing) are supported currently." "or 2 (Website) are supported currently."
+ f" Got {self.engine_data_type}" + f" Got {self.engine_data_type}"
) )
@ -431,7 +441,9 @@ class GoogleVertexAIMultiTurnSearchRetriever(
response = self._client.converse_conversation(request) response = self._client.converse_conversation(request)
if self.engine_data_type == 2: if self.engine_data_type == 2:
return self._convert_website_search_response(response.search_results) return self._convert_website_search_response(
response.search_results, "extractive_answers"
)
return self._convert_unstructured_search_response( return self._convert_unstructured_search_response(
response.search_results, "extractive_answers" response.search_results, "extractive_answers"

Loading…
Cancel
Save