community: refactor Arxiv search logic (#27084)

PR message: Description: This PR refactors the Arxiv API wrapper by extracting the Arxiv search logic into a helper function (_fetch_results) to reduce code duplication and improve maintainability. The helper function is used in methods like get_summaries_as_docs, run, and lazy_load, streamlining the code and making it easier to maintain in the future. Issue: This is a minor refactor, so no specific issue is being fixed. Dependencies: No new dependencies are introduced with this change. Add tests and docs: No new integrations were added, so no additional tests or docs are necessary for this PR. Lint and test: I have run make format, make lint, and make test to ensure all checks pass successfully. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
2024-11-13 19:10:52 +00:00 · 2024-10-15 18:43:03 +03:00 · 2024-10-15 18:43:03 +03:00 · 443b37403d
commit 443b37403d
parent 57fbc6bdf1
1 changed files with 21 additions and 27 deletions
--- a/libs/community/langchain_community/utilities/arxiv.py
+++ b/libs/community/langchain_community/utilities/arxiv.py
@@ -94,6 +94,16 @@ class ArxivAPIWrapper(BaseModel):
            )
        return values

+    def _fetch_results(self, query: str) -> Any:
+        """Helper function to fetch arxiv results based on query."""
+        if self.is_arxiv_identifier(query):
+            return self.arxiv_search(
+                id_list=query.split(), max_results=self.top_k_results
+            ).results()
+        return self.arxiv_search(
+            query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+        ).results()
+
    def get_summaries_as_docs(self, query: str) -> List[Document]:
        """
        Performs an arxiv search and returns list of
@@ -107,16 +117,11 @@ class ArxivAPIWrapper(BaseModel):
            query: a plaintext search query
        """
        try:
-            if self.is_arxiv_identifier(query):
-                results = self.arxiv_search(
-                    id_list=query.split(),
-                    max_results=self.top_k_results,
-                ).results()
-            else:
-                results = self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
+            results = self._fetch_results(
+                query
+            )  # Using helper function to fetch results
        except self.arxiv_exceptions as ex:
+            logger.error(f"Arxiv exception: {ex}")  # Added error logging
            return [Document(page_content=f"Arxiv exception: {ex}")]
        docs = [
            Document(
@@ -146,16 +151,11 @@ class ArxivAPIWrapper(BaseModel):
            query: a plaintext search query
        """
        try:
-            if self.is_arxiv_identifier(query):
-                results = self.arxiv_search(
-                    id_list=query.split(),
-                    max_results=self.top_k_results,
-                ).results()
-            else:
-                results = self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
+            results = self._fetch_results(
+                query
+            )  # Using helper function to fetch results
        except self.arxiv_exceptions as ex:
+            logger.error(f"Arxiv exception: {ex}")  # Added error logging
            return f"Arxiv exception: {ex}"
        docs = [
            f"Published: {result.updated.date()}\n"
@@ -208,15 +208,9 @@ class ArxivAPIWrapper(BaseModel):
        try:
            # Remove the ":" and "-" from the query, as they can cause search problems
            query = query.replace(":", "").replace("-", "")
-            if self.is_arxiv_identifier(query):
-                results = self.arxiv_search(
-                    id_list=query[: self.ARXIV_MAX_QUERY_LENGTH].split(),
-                    max_results=self.load_max_docs,
-                ).results()
-            else:
-                results = self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-                ).results()
+            results = self._fetch_results(
+                query
+            )  # Using helper function to fetch results
        except self.arxiv_exceptions as ex:
            logger.debug("Error on arxiv: %s", ex)
            return