diff --git a/docs/docs/additional_resources/arxiv_references.mdx b/docs/docs/additional_resources/arxiv_references.mdx new file mode 100644 index 0000000000..3e258a4a82 --- /dev/null +++ b/docs/docs/additional_resources/arxiv_references.mdx @@ -0,0 +1,519 @@ +# arXiv + +LangChain implements the latest research in the field of Natural Language Processing. +This page contains `arXiv` papers referenced in the LangChain Documentation and API Reference. + +## Summary + +| arXiv id / Title | Authors | Published date 🔻 | LangChain Documentation and API Reference | +|------------------|---------|-------------------|-------------------------| +| `2307.03172v3` [Lost in the Middle: How Language Models Use Long Contexts](http://arxiv.org/abs/2307.03172v3) | Nelson F. Liu, Kevin Lin, John Hewitt, et al. | 2023-07-06 | `Docs:` [docs/modules/data_connection/retrievers/long_context_reorder](https://python.langchain.com/docs/modules/data_connection/retrievers/long_context_reorder) +| `2305.08291v1` [Large Language Model Guided Tree-of-Thought](http://arxiv.org/abs/2305.08291v1) | Jieyi Long | 2023-05-15 | `API:` [langchain_experimental.tot](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.tot) +| `2305.06983v2` [Active Retrieval Augmented Generation](http://arxiv.org/abs/2305.06983v2) | Zhengbao Jiang, Frank F. Xu, Luyu Gao, et al. | 2023-05-11 | `Docs:` [docs/modules/chains](https://python.langchain.com/docs/modules/chains) +| `2303.17580v4` [HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face](http://arxiv.org/abs/2303.17580v4) | Yongliang Shen, Kaitao Song, Xu Tan, et al. | 2023-03-30 | `API:` [langchain_experimental.autonomous_agents](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.autonomous_agents) +| `2303.08774v6` [GPT-4 Technical Report](http://arxiv.org/abs/2303.08774v6) | OpenAI, Josh Achiam, Steven Adler, et al. | 2023-03-15 | `Docs:` [docs/integrations/vectorstores/mongodb_atlas](https://python.langchain.com/docs/integrations/vectorstores/mongodb_atlas) +| `2301.10226v4` [A Watermark for Large Language Models](http://arxiv.org/abs/2301.10226v4) | John Kirchenbauer, Jonas Geiping, Yuxin Wen, et al. | 2023-01-24 | `API:` [langchain_community.llms...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference), [langchain_community.llms...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community.llms...OCIModelDeploymentTGI](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI.html#langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI) +| `2212.10496v1` [Precise Zero-Shot Dense Retrieval without Relevance Labels](http://arxiv.org/abs/2212.10496v1) | Luyu Gao, Xueguang Ma, Jimmy Lin, et al. 
| 2022-12-20 | `Docs:` [docs/use_cases/query_analysis/techniques/hyde](https://python.langchain.com/docs/use_cases/query_analysis/techniques/hyde), `API:` [langchain.chains...HypotheticalDocumentEmbedder](https://api.python.langchain.com/en/latest/chains/langchain.chains.hyde.base.HypotheticalDocumentEmbedder.html#langchain.chains.hyde.base.HypotheticalDocumentEmbedder) +| `2212.08073v1` [Constitutional AI: Harmlessness from AI Feedback](http://arxiv.org/abs/2212.08073v1) | Yuntao Bai, Saurav Kadavath, Sandipan Kundu, et al. | 2022-12-15 | `Docs:` [docs/guides/productionization/evaluation/string/criteria_eval_chain](https://python.langchain.com/docs/guides/productionization/evaluation/string/criteria_eval_chain) +| `2212.07425v3` [Robust and Explainable Identification of Logical Fallacies in Natural Language Arguments](http://arxiv.org/abs/2212.07425v3) | Zhivar Sourati, Vishnu Priya Prasanna Venkatesh, Darshan Deshpande, et al. | 2022-12-12 | `API:` [langchain_experimental.fallacy_removal](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.fallacy_removal) +| `2211.13892v2` [Complementary Explanations for Effective In-Context Learning](http://arxiv.org/abs/2211.13892v2) | Xi Ye, Srinivasan Iyer, Asli Celikyilmaz, et al. | 2022-11-25 | `API:` [langchain_core.example_selectors...MaxMarginalRelevanceExampleSelector](https://api.python.langchain.com/en/latest/example_selectors/langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector.html#langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector) +| `2211.10435v2` [PAL: Program-aided Language Models](http://arxiv.org/abs/2211.10435v2) | Luyu Gao, Aman Madaan, Shuyan Zhou, et al. | 2022-11-18 | `API:` [langchain_experimental.pal_chain...PALChain](https://api.python.langchain.com/en/latest/pal_chain/langchain_experimental.pal_chain.base.PALChain.html#langchain_experimental.pal_chain.base.PALChain), [langchain_experimental.pal_chain](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.pal_chain) +| `2209.10785v2` [Deep Lake: a Lakehouse for Deep Learning](http://arxiv.org/abs/2209.10785v2) | Sasun Hambardzumyan, Abhinav Tuli, Levon Ghukasyan, et al. 
| 2022-09-22 | `Docs:` [docs/integrations/providers/activeloop_deeplake](https://python.langchain.com/docs/integrations/providers/activeloop_deeplake) +| `2205.12654v1` [Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages](http://arxiv.org/abs/2205.12654v1) | Kevin Heffernan, Onur Çelebi, Holger Schwenk | 2022-05-25 | `API:` [langchain_community.embeddings...LaserEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.laser.LaserEmbeddings.html#langchain_community.embeddings.laser.LaserEmbeddings) +| `2204.00498v1` [Evaluating the Text-to-SQL Capabilities of Large Language Models](http://arxiv.org/abs/2204.00498v1) | Nitarshan Rajkumar, Raymond Li, Dzmitry Bahdanau | 2022-03-15 | `Docs:` [docs/use_cases/sql/quickstart](https://python.langchain.com/docs/use_cases/sql/quickstart), `API:` [langchain_community.utilities...SQLDatabase](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.sql_database.SQLDatabase.html#langchain_community.utilities.sql_database.SQLDatabase), [langchain_community.utilities...SparkSQL](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.spark_sql.SparkSQL.html#langchain_community.utilities.spark_sql.SparkSQL) +| `2202.00666v5` [Locally Typical Sampling](http://arxiv.org/abs/2202.00666v5) | Clara Meister, Tiago Pimentel, Gian Wiher, et al. | 2022-02-01 | `API:` [langchain_community.llms...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference), [langchain_community.llms...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint) +| `2103.00020v1` [Learning Transferable Visual Models From Natural Language Supervision](http://arxiv.org/abs/2103.00020v1) | Alec Radford, Jong Wook Kim, Chris Hallacy, et al. | 2021-02-26 | `API:` [langchain_experimental.open_clip](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.open_clip) +| `1909.05858v2` [CTRL: A Conditional Transformer Language Model for Controllable Generation](http://arxiv.org/abs/1909.05858v2) | Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, et al. | 2019-09-11 | `API:` [langchain_community.llms...HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference), [langchain_community.llms...HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint) +| `1908.10084v1` [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](http://arxiv.org/abs/1908.10084v1) | Nils Reimers, Iryna Gurevych | 2019-08-27 | `Docs:` [docs/integrations/text_embedding/sentence_transformers](https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers) + +## Lost in the Middle: How Language Models Use Long Contexts + +- **arXiv id:** 2307.03172v3 +- **Title:** Lost in the Middle: How Language Models Use Long Contexts +- **Authors:** Nelson F. 
Liu, Kevin Lin, John Hewitt, et al. +- **Published Date:** 2023-07-06 +- **URL:** http://arxiv.org/abs/2307.03172v3 +- **LangChain Documentation:** [docs/modules/data_connection/retrievers/long_context_reorder](https://python.langchain.com/docs/modules/data_connection/retrievers/long_context_reorder) + + +**Abstract:** While recent language models have the ability to take long contexts as input, +relatively little is known about how well they use longer context. We analyze +the performance of language models on two tasks that require identifying +relevant information in their input contexts: multi-document question answering +and key-value retrieval. We find that performance can degrade significantly +when changing the position of relevant information, indicating that current +language models do not robustly make use of information in long input contexts. +In particular, we observe that performance is often highest when relevant +information occurs at the beginning or end of the input context, and +significantly degrades when models must access relevant information in the +middle of long contexts, even for explicitly long-context models. Our analysis +provides a better understanding of how language models use their input context +and provides new evaluation protocols for future long-context language models. + +## Large Language Model Guided Tree-of-Thought + +- **arXiv id:** 2305.08291v1 +- **Title:** Large Language Model Guided Tree-of-Thought +- **Authors:** Jieyi Long +- **Published Date:** 2023-05-15 +- **URL:** http://arxiv.org/abs/2305.08291v1 + +- **LangChain API Reference:** [langchain_experimental.tot](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.tot) + +**Abstract:** In this paper, we introduce the Tree-of-Thought (ToT) framework, a novel +approach aimed at improving the problem-solving capabilities of auto-regressive +large language models (LLMs). The ToT technique is inspired by the human mind's +approach for solving complex reasoning tasks through trial and error. In this +process, the human mind explores the solution space through a tree-like thought +process, allowing for backtracking when necessary. To implement ToT as a +software system, we augment an LLM with additional modules including a prompter +agent, a checker module, a memory module, and a ToT controller. In order to +solve a given problem, these modules engage in a multi-round conversation with +the LLM. The memory module records the conversation and state history of the +problem solving process, which allows the system to backtrack to the previous +steps of the thought-process and explore other directions from there. To verify +the effectiveness of the proposed technique, we implemented a ToT-based solver +for the Sudoku Puzzle. Experimental results show that the ToT framework can +significantly increase the success rate of Sudoku puzzle solving. Our +implementation of the ToT-based Sudoku solver is available on GitHub: +https://github.com/jieyilong/tree-of-thought-puzzle-solver. + +## Active Retrieval Augmented Generation + +- **arXiv id:** 2305.06983v2 +- **Title:** Active Retrieval Augmented Generation +- **Authors:** Zhengbao Jiang, Frank F. Xu, Luyu Gao, et al. 
+- **Published Date:** 2023-05-11 +- **URL:** http://arxiv.org/abs/2305.06983v2 +- **LangChain Documentation:** [docs/modules/chains](https://python.langchain.com/docs/modules/chains) + + +**Abstract:** Despite the remarkable ability of large language models (LMs) to comprehend +and generate language, they have a tendency to hallucinate and create factually +inaccurate output. Augmenting LMs by retrieving information from external +knowledge resources is one promising solution. Most existing retrieval +augmented LMs employ a retrieve-and-generate setup that only retrieves +information once based on the input. This is limiting, however, in more general +scenarios involving generation of long texts, where continually gathering +information throughout generation is essential. In this work, we provide a +generalized view of active retrieval augmented generation, methods that +actively decide when and what to retrieve across the course of the generation. +We propose Forward-Looking Active REtrieval augmented generation (FLARE), a +generic method which iteratively uses a prediction of the upcoming sentence to +anticipate future content, which is then utilized as a query to retrieve +relevant documents to regenerate the sentence if it contains low-confidence +tokens. We test FLARE along with baselines comprehensively over 4 long-form +knowledge-intensive generation tasks/datasets. FLARE achieves superior or +competitive performance on all tasks, demonstrating the effectiveness of our +method. Code and datasets are available at https://github.com/jzbjyb/FLARE. + +## HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face + +- **arXiv id:** 2303.17580v4 +- **Title:** HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face +- **Authors:** Yongliang Shen, Kaitao Song, Xu Tan, et al. +- **Published Date:** 2023-03-30 +- **URL:** http://arxiv.org/abs/2303.17580v4 + +- **LangChain API Reference:** [langchain_experimental.autonomous_agents](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.autonomous_agents) + +**Abstract:** Solving complicated AI tasks with different domains and modalities is a key +step toward artificial general intelligence. While there are numerous AI models +available for various domains and modalities, they cannot handle complicated AI +tasks autonomously. Considering large language models (LLMs) have exhibited +exceptional abilities in language understanding, generation, interaction, and +reasoning, we advocate that LLMs could act as a controller to manage existing +AI models to solve complicated AI tasks, with language serving as a generic +interface to empower this. Based on this philosophy, we present HuggingGPT, an +LLM-powered agent that leverages LLMs (e.g., ChatGPT) to connect various AI +models in machine learning communities (e.g., Hugging Face) to solve AI tasks. +Specifically, we use ChatGPT to conduct task planning when receiving a user +request, select models according to their function descriptions available in +Hugging Face, execute each subtask with the selected AI model, and summarize +the response according to the execution results. 
By leveraging the strong +language capability of ChatGPT and abundant AI models in Hugging Face, +HuggingGPT can tackle a wide range of sophisticated AI tasks spanning different +modalities and domains and achieve impressive results in language, vision, +speech, and other challenging tasks, which paves a new way towards the +realization of artificial general intelligence. + +## GPT-4 Technical Report + +- **arXiv id:** 2303.08774v6 +- **Title:** GPT-4 Technical Report +- **Authors:** OpenAI, Josh Achiam, Steven Adler, et al. +- **Published Date:** 2023-03-15 +- **URL:** http://arxiv.org/abs/2303.08774v6 +- **LangChain Documentation:** [docs/integrations/vectorstores/mongodb_atlas](https://python.langchain.com/docs/integrations/vectorstores/mongodb_atlas) + + +**Abstract:** We report the development of GPT-4, a large-scale, multimodal model which can +accept image and text inputs and produce text outputs. While less capable than +humans in many real-world scenarios, GPT-4 exhibits human-level performance on +various professional and academic benchmarks, including passing a simulated bar +exam with a score around the top 10% of test takers. GPT-4 is a +Transformer-based model pre-trained to predict the next token in a document. +The post-training alignment process results in improved performance on measures +of factuality and adherence to desired behavior. A core component of this +project was developing infrastructure and optimization methods that behave +predictably across a wide range of scales. This allowed us to accurately +predict some aspects of GPT-4's performance based on models trained with no +more than 1/1,000th the compute of GPT-4. + +## A Watermark for Large Language Models + +- **arXiv id:** 2301.10226v4 +- **Title:** A Watermark for Large Language Models +- **Authors:** John Kirchenbauer, Jonas Geiping, Yuxin Wen, et al. +- **Published Date:** 2023-01-24 +- **URL:** http://arxiv.org/abs/2301.10226v4 + +- **LangChain API Reference:** [langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference), [langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint), [langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI.html#langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI) + +**Abstract:** Potential harms of large language models can be mitigated by watermarking +model output, i.e., embedding signals into generated text that are invisible to +humans but algorithmically detectable from a short span of tokens. We propose a +watermarking framework for proprietary language models. The watermark can be +embedded with negligible impact on text quality, and can be detected using an +efficient open-source algorithm without access to the language model API or +parameters. The watermark works by selecting a randomized set of "green" tokens +before a word is generated, and then softly promoting use of green tokens +during sampling. 
We propose a statistical test for detecting the watermark with +interpretable p-values, and derive an information-theoretic framework for +analyzing the sensitivity of the watermark. We test the watermark using a +multi-billion parameter model from the Open Pretrained Transformer (OPT) +family, and discuss robustness and security. + +## Precise Zero-Shot Dense Retrieval without Relevance Labels + +- **arXiv id:** 2212.10496v1 +- **Title:** Precise Zero-Shot Dense Retrieval without Relevance Labels +- **Authors:** Luyu Gao, Xueguang Ma, Jimmy Lin, et al. +- **Published Date:** 2022-12-20 +- **URL:** http://arxiv.org/abs/2212.10496v1 +- **LangChain Documentation:** [docs/use_cases/query_analysis/techniques/hyde](https://python.langchain.com/docs/use_cases/query_analysis/techniques/hyde) +- **LangChain API Reference:** [langchain.chains.hyde.base.HypotheticalDocumentEmbedder](https://api.python.langchain.com/en/latest/chains/langchain.chains.hyde.base.HypotheticalDocumentEmbedder.html#langchain.chains.hyde.base.HypotheticalDocumentEmbedder) + +**Abstract:** While dense retrieval has been shown effective and efficient across tasks and +languages, it remains difficult to create effective fully zero-shot dense +retrieval systems when no relevance label is available. In this paper, we +recognize the difficulty of zero-shot learning and encoding relevance. Instead, +we propose to pivot through Hypothetical Document Embeddings (HyDE). Given a +query, HyDE first zero-shot instructs an instruction-following language model +(e.g. InstructGPT) to generate a hypothetical document. The document captures +relevance patterns but is unreal and may contain false details. Then, an +unsupervised contrastively learned encoder (e.g. Contriever) encodes the +document into an embedding vector. This vector identifies a neighborhood in the +corpus embedding space, where similar real documents are retrieved based on +vector similarity. This second step grounds the generated document to the actual +corpus, with the encoder's dense bottleneck filtering out the incorrect +details. Our experiments show that HyDE significantly outperforms the +state-of-the-art unsupervised dense retriever Contriever and shows strong +performance comparable to fine-tuned retrievers, across various tasks (e.g. web +search, QA, fact verification) and languages (e.g. sw, ko, ja). + +## Constitutional AI: Harmlessness from AI Feedback + +- **arXiv id:** 2212.08073v1 +- **Title:** Constitutional AI: Harmlessness from AI Feedback +- **Authors:** Yuntao Bai, Saurav Kadavath, Sandipan Kundu, et al. +- **Published Date:** 2022-12-15 +- **URL:** http://arxiv.org/abs/2212.08073v1 +- **LangChain Documentation:** [docs/guides/productionization/evaluation/string/criteria_eval_chain](https://python.langchain.com/docs/guides/productionization/evaluation/string/criteria_eval_chain) + + +**Abstract:** As AI systems become more capable, we would like to enlist their help to +supervise other AIs. We experiment with methods for training a harmless AI +assistant through self-improvement, without any human labels identifying +harmful outputs. The only human oversight is provided through a list of rules +or principles, and so we refer to the method as 'Constitutional AI'. The +process involves both a supervised learning and a reinforcement learning phase. +In the supervised phase we sample from an initial model, then generate +self-critiques and revisions, and then finetune the original model on revised +responses. 
In the RL phase, we sample from the finetuned model, use a model to +evaluate which of the two samples is better, and then train a preference model +from this dataset of AI preferences. We then train with RL using the preference +model as the reward signal, i.e. we use 'RL from AI Feedback' (RLAIF). As a +result we are able to train a harmless but non-evasive AI assistant that +engages with harmful queries by explaining its objections to them. Both the SL +and RL methods can leverage chain-of-thought style reasoning to improve the +human-judged performance and transparency of AI decision making. These methods +make it possible to control AI behavior more precisely and with far fewer human +labels. + +## Robust and Explainable Identification of Logical Fallacies in Natural Language Arguments + +- **arXiv id:** 2212.07425v3 +- **Title:** Robust and Explainable Identification of Logical Fallacies in Natural Language Arguments +- **Authors:** Zhivar Sourati, Vishnu Priya Prasanna Venkatesh, Darshan Deshpande, et al. +- **Published Date:** 2022-12-12 +- **URL:** http://arxiv.org/abs/2212.07425v3 + +- **LangChain API Reference:** [langchain_experimental.fallacy_removal](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.fallacy_removal) + +**Abstract:** The spread of misinformation, propaganda, and flawed argumentation has been +amplified in the Internet era. Given the volume of data and the subtlety of +identifying violations of argumentation norms, supporting information analytics +tasks, like content moderation, with trustworthy methods that can identify +logical fallacies is essential. In this paper, we formalize prior theoretical +work on logical fallacies into a comprehensive three-stage evaluation framework +of detection, coarse-grained, and fine-grained classification. We adapt +existing evaluation datasets for each stage of the evaluation. We employ three +families of robust and explainable methods based on prototype reasoning, +instance-based reasoning, and knowledge injection. The methods combine language +models with background knowledge and explainable mechanisms. Moreover, we +address data sparsity with strategies for data augmentation and curriculum +learning. Our three-stage framework natively consolidates prior datasets and +methods from existing tasks, like propaganda detection, serving as an +overarching evaluation testbed. We extensively evaluate these methods on our +datasets, focusing on their robustness and explainability. Our results provide +insight into the strengths and weaknesses of the methods on different +components and fallacy classes, indicating that fallacy identification is a +challenging task that may require specialized forms of reasoning to capture +various classes. We share our open-source code and data on GitHub to support +further work on logical fallacy identification. + +## Complementary Explanations for Effective In-Context Learning + +- **arXiv id:** 2211.13892v2 +- **Title:** Complementary Explanations for Effective In-Context Learning +- **Authors:** Xi Ye, Srinivasan Iyer, Asli Celikyilmaz, et al. 
+- **Published Date:** 2022-11-25 +- **URL:** http://arxiv.org/abs/2211.13892v2 + +- **LangChain API Reference:** [langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector](https://api.python.langchain.com/en/latest/example_selectors/langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector.html#langchain_core.example_selectors.semantic_similarity.MaxMarginalRelevanceExampleSelector) + +**Abstract:** Large language models (LLMs) have exhibited remarkable capabilities in +learning from explanations in prompts, but there has been limited understanding +of exactly how these explanations function or why they are effective. This work +aims to better understand the mechanisms by which explanations are used for +in-context learning. We first study the impact of two different factors on the +performance of prompts with explanations: the computation trace (the way the +solution is decomposed) and the natural language used to express the prompt. By +perturbing explanations on three controlled tasks, we show that both factors +contribute to the effectiveness of explanations. We further study how to form +maximally effective sets of explanations for solving a given test query. We +find that LLMs can benefit from the complementarity of the explanation set: +diverse reasoning skills shown by different exemplars can lead to better +performance. Therefore, we propose a maximal marginal relevance-based exemplar +selection approach for constructing exemplar sets that are both relevant as +well as complementary, which successfully improves the in-context learning +performance across three real-world tasks on multiple LLMs. + +## PAL: Program-aided Language Models + +- **arXiv id:** 2211.10435v2 +- **Title:** PAL: Program-aided Language Models +- **Authors:** Luyu Gao, Aman Madaan, Shuyan Zhou, et al. +- **Published Date:** 2022-11-18 +- **URL:** http://arxiv.org/abs/2211.10435v2 + +- **LangChain API Reference:** [langchain_experimental.pal_chain.base.PALChain](https://api.python.langchain.com/en/latest/pal_chain/langchain_experimental.pal_chain.base.PALChain.html#langchain_experimental.pal_chain.base.PALChain), [langchain_experimental.pal_chain](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.pal_chain) + +**Abstract:** Large language models (LLMs) have recently demonstrated an impressive ability +to perform arithmetic and symbolic reasoning tasks, when provided with a few +examples at test time ("few-shot prompting"). Much of this success can be +attributed to prompting methods such as "chain-of-thought", which employ LLMs +for both understanding the problem description by decomposing it into steps, as +well as solving each step of the problem. While LLMs seem to be adept at this +sort of step-by-step decomposition, LLMs often make logical and arithmetic +mistakes in the solution part, even when the problem is decomposed correctly. +In this paper, we present Program-Aided Language models (PAL): a novel approach +that uses the LLM to read natural language problems and generate programs as +the intermediate reasoning steps, but offloads the solution step to a runtime +such as a Python interpreter. With PAL, decomposing the natural language +problem into runnable steps remains the only learning task for the LLM, while +solving is delegated to the interpreter. 
We demonstrate this synergy between a +neural LLM and a symbolic interpreter across 13 mathematical, symbolic, and +algorithmic reasoning tasks from BIG-Bench Hard and other benchmarks. In all +these natural language reasoning tasks, generating code using an LLM and +reasoning using a Python interpreter leads to more accurate results than much +larger models. For example, PAL using Codex achieves state-of-the-art few-shot +accuracy on the GSM8K benchmark of math word problems, surpassing PaLM-540B +which uses chain-of-thought by absolute 15% top-1. Our code and data are +publicly available at http://reasonwithpal.com/ . + +## Deep Lake: a Lakehouse for Deep Learning + +- **arXiv id:** 2209.10785v2 +- **Title:** Deep Lake: a Lakehouse for Deep Learning +- **Authors:** Sasun Hambardzumyan, Abhinav Tuli, Levon Ghukasyan, et al. +- **Published Date:** 2022-09-22 +- **URL:** http://arxiv.org/abs/2209.10785v2 +- **LangChain Documentation:** [docs/integrations/providers/activeloop_deeplake](https://python.langchain.com/docs/integrations/providers/activeloop_deeplake) + + +**Abstract:** Traditional data lakes provide critical data infrastructure for analytical +workloads by enabling time travel, running SQL queries, ingesting data with +ACID transactions, and visualizing petabyte-scale datasets on cloud storage. +They allow organizations to break down data silos, unlock data-driven +decision-making, improve operational efficiency, and reduce costs. However, as +deep learning usage increases, traditional data lakes are not well-designed for +applications such as natural language processing (NLP), audio processing, +computer vision, and applications involving non-tabular datasets. This paper +presents Deep Lake, an open-source lakehouse for deep learning applications +developed at Activeloop. Deep Lake maintains the benefits of a vanilla data +lake with one key difference: it stores complex data, such as images, videos, +annotations, as well as tabular data, in the form of tensors and rapidly +streams the data over the network to (a) Tensor Query Language, (b) in-browser +visualization engine, or (c) deep learning frameworks without sacrificing GPU +utilization. Datasets stored in Deep Lake can be accessed from PyTorch, +TensorFlow, JAX, and integrate with numerous MLOps tools. + +## Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages + +- **arXiv id:** 2205.12654v1 +- **Title:** Bitext Mining Using Distilled Sentence Representations for Low-Resource Languages +- **Authors:** Kevin Heffernan, Onur Çelebi, Holger Schwenk +- **Published Date:** 2022-05-25 +- **URL:** http://arxiv.org/abs/2205.12654v1 + +- **LangChain API Reference:** [langchain_community.embeddings.laser.LaserEmbeddings](https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.laser.LaserEmbeddings.html#langchain_community.embeddings.laser.LaserEmbeddings) + +**Abstract:** Scaling multilingual representation learning beyond the hundred most frequent +languages is challenging, in particular to cover the long tail of low-resource +languages. A promising approach has been to train one-for-all multilingual +models capable of cross-lingual transfer, but these models often suffer from +insufficient capacity and interference between unrelated languages. Instead, we +move away from this approach and focus on training multiple language (family) +specific representations, but most prominently enable all languages to still be +encoded in the same representational space. 
To achieve this, we focus on +teacher-student training, allowing all encoders to be mutually compatible for +bitext mining, and enabling fast learning of new languages. We introduce a new +teacher-student training scheme which combines supervised and self-supervised +training, allowing encoders to take advantage of monolingual training data, +which is valuable in the low-resource setting. + Our approach significantly outperforms the original LASER encoder. We study +very low-resource languages and handle 50 African languages, many of which are +not covered by any other model. For these languages, we train sentence +encoders, mine bitexts, and validate the bitexts by training NMT systems. + +## Evaluating the Text-to-SQL Capabilities of Large Language Models + +- **arXiv id:** 2204.00498v1 +- **Title:** Evaluating the Text-to-SQL Capabilities of Large Language Models +- **Authors:** Nitarshan Rajkumar, Raymond Li, Dzmitry Bahdanau +- **Published Date:** 2022-03-15 +- **URL:** http://arxiv.org/abs/2204.00498v1 +- **LangChain Documentation:** [docs/use_cases/sql/quickstart](https://python.langchain.com/docs/use_cases/sql/quickstart) +- **LangChain API Reference:** [langchain_community.utilities.sql_database.SQLDatabase](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.sql_database.SQLDatabase.html#langchain_community.utilities.sql_database.SQLDatabase), [langchain_community.utilities.spark_sql.SparkSQL](https://api.python.langchain.com/en/latest/utilities/langchain_community.utilities.spark_sql.SparkSQL.html#langchain_community.utilities.spark_sql.SparkSQL) + +**Abstract:** We perform an empirical evaluation of Text-to-SQL capabilities of the Codex +language model. We find that, without any finetuning, Codex is a strong +baseline on the Spider benchmark; we also analyze the failure modes of Codex in +this setting. Furthermore, we demonstrate on the GeoQuery and Scholar +benchmarks that a small number of in-domain examples provided in the prompt +enables Codex to perform better than state-of-the-art models finetuned on such +few-shot examples. + +## Locally Typical Sampling + +- **arXiv id:** 2202.00666v5 +- **Title:** Locally Typical Sampling +- **Authors:** Clara Meister, Tiago Pimentel, Gian Wiher, et al. +- **Published Date:** 2022-02-01 +- **URL:** http://arxiv.org/abs/2202.00666v5 + +- **LangChain API Reference:** [langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference), [langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint) + +**Abstract:** Today's probabilistic language generators fall short when it comes to +producing coherent and fluent text despite the fact that the underlying models +perform well under standard metrics, e.g., perplexity. This discrepancy has +puzzled the language generation community for the last few years. 
In this work, +we posit that the abstraction of natural language generation as a discrete +stochastic process--which allows for an information-theoretic analysis--can +provide new insights into the behavior of probabilistic language generators, +e.g., why high-probability texts can be dull or repetitive. Humans use language +as a means of communicating information, aiming to do so in a simultaneously +efficient and error-minimizing manner; in fact, psycholinguistics research +suggests humans choose each word in a string with this subconscious goal in +mind. We formally define the set of strings that meet this criterion: those for +which each word has an information content close to the expected information +content, i.e., the conditional entropy of our model. We then propose a simple +and efficient procedure for enforcing this criterion when generating from +probabilistic models, which we call locally typical sampling. Automatic and +human evaluations show that, in comparison to nucleus and top-k sampling, +locally typical sampling offers competitive performance (in both abstractive +summarization and story generation) in terms of quality while consistently +reducing degenerate repetitions. + +## Learning Transferable Visual Models From Natural Language Supervision + +- **arXiv id:** 2103.00020v1 +- **Title:** Learning Transferable Visual Models From Natural Language Supervision +- **Authors:** Alec Radford, Jong Wook Kim, Chris Hallacy, et al. +- **Published Date:** 2021-02-26 +- **URL:** http://arxiv.org/abs/2103.00020v1 + +- **LangChain API Reference:** [langchain_experimental.open_clip](https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.open_clip) + +**Abstract:** State-of-the-art computer vision systems are trained to predict a fixed set +of predetermined object categories. This restricted form of supervision limits +their generality and usability since additional labeled data is needed to +specify any other visual concept. Learning directly from raw text about images +is a promising alternative which leverages a much broader source of +supervision. We demonstrate that the simple pre-training task of predicting +which caption goes with which image is an efficient and scalable way to learn +SOTA image representations from scratch on a dataset of 400 million (image, +text) pairs collected from the internet. After pre-training, natural language +is used to reference learned visual concepts (or describe new ones) enabling +zero-shot transfer of the model to downstream tasks. We study the performance +of this approach by benchmarking on over 30 different existing computer vision +datasets, spanning tasks such as OCR, action recognition in videos, +geo-localization, and many types of fine-grained object classification. The +model transfers non-trivially to most tasks and is often competitive with a +fully supervised baseline without the need for any dataset specific training. +For instance, we match the accuracy of the original ResNet-50 on ImageNet +zero-shot without needing to use any of the 1.28 million training examples it +was trained on. We release our code and pre-trained model weights at +https://github.com/OpenAI/CLIP. + +## CTRL: A Conditional Transformer Language Model for Controllable Generation + +- **arXiv id:** 1909.05858v2 +- **Title:** CTRL: A Conditional Transformer Language Model for Controllable Generation +- **Authors:** Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, et al. 
+- **Published Date:** 2019-09-11 +- **URL:** http://arxiv.org/abs/1909.05858v2 + +- **LangChain API Reference:** [langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference.html#langchain_community.llms.huggingface_text_gen_inference.HuggingFaceTextGenInference), [langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint.html#langchain_community.llms.huggingface_endpoint.HuggingFaceEndpoint) + +**Abstract:** Large-scale language models show promising text generation capabilities, but +users cannot easily control particular aspects of the generated text. We +release CTRL, a 1.63 billion-parameter conditional transformer language model, +trained to condition on control codes that govern style, content, and +task-specific behavior. Control codes were derived from structure that +naturally co-occurs with raw text, preserving the advantages of unsupervised +learning while providing more explicit control over text generation. These +codes also allow CTRL to predict which parts of the training data are most +likely given a sequence. This provides a potential method for analyzing large +amounts of data via model-based source attribution. We have released multiple +full-sized, pretrained versions of CTRL at https://github.com/salesforce/ctrl. + +## Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks + +- **arXiv id:** 1908.10084v1 +- **Title:** Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks +- **Authors:** Nils Reimers, Iryna Gurevych +- **Published Date:** 2019-08-27 +- **URL:** http://arxiv.org/abs/1908.10084v1 +- **LangChain Documentation:** [docs/integrations/text_embedding/sentence_transformers](https://python.langchain.com/docs/integrations/text_embedding/sentence_transformers) + + +**Abstract:** BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) have set a new +state-of-the-art performance on sentence-pair regression tasks like semantic +textual similarity (STS). However, they require that both sentences are fed into +the network, which causes a massive computational overhead: Finding the most +similar pair in a collection of 10,000 sentences requires about 50 million +inference computations (~65 hours) with BERT. The construction of BERT makes it +unsuitable for semantic similarity search as well as for unsupervised tasks +like clustering. + In this publication, we present Sentence-BERT (SBERT), a modification of the +pretrained BERT network that uses siamese and triplet network structures to +derive semantically meaningful sentence embeddings that can be compared using +cosine-similarity. This reduces the effort for finding the most similar pair +from 65 hours with BERT / RoBERTa to about 5 seconds with SBERT, while +maintaining the accuracy from BERT. + We evaluate SBERT and SRoBERTa on common STS tasks and transfer learning +tasks, where it outperforms other state-of-the-art sentence embedding methods. 
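The sentence-transformers models described in this paper are available in LangChain through the integration linked above. A minimal usage sketch follows; the model name and sample texts are illustrative choices, not taken from the referenced page:

```python
# pip install langchain-community sentence-transformers
from langchain_community.embeddings import HuggingFaceEmbeddings

# Any sentence-transformers checkpoint can be used; this one is a common default.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# SBERT-style embeddings are meant to be compared directly, e.g. with cosine similarity.
doc_vectors = embeddings.embed_documents(
    [
        "SBERT derives sentence embeddings that can be compared with cosine similarity.",
        "Plain BERT requires feeding both sentences through the network together.",
    ]
)
query_vector = embeddings.embed_query("How do I compare sentences efficiently?")
```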
+ \ No newline at end of file diff --git a/docs/docusaurus.config.js b/docs/docusaurus.config.js index a9ef56bb35..2d0e793e8c 100644 --- a/docs/docusaurus.config.js +++ b/docs/docusaurus.config.js @@ -202,6 +202,10 @@ const config = { docId: "additional_resources/youtube", label: "YouTube" }, + { + to: "/docs/additional_resources/arxiv_references", + label: "arXiv" + }, ] }, { diff --git a/docs/scripts/arxiv_references.py b/docs/scripts/arxiv_references.py new file mode 100644 index 0000000000..55d9169d83 --- /dev/null +++ b/docs/scripts/arxiv_references.py @@ -0,0 +1,465 @@ +"""Parse arXiv references from the documentation. +Generate a page with a table of the arXiv references with links to the documentation pages. +""" + +import logging +import os +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Set + +from pydantic.v1 import BaseModel, root_validator + +# TODO parse docstrings for arXiv references +# TODO Generate a page with a table of the references with correspondent modules/classes/functions. + +logger = logging.getLogger(__name__) + +_ROOT_DIR = Path(os.path.abspath(__file__)).parents[2] +DOCS_DIR = _ROOT_DIR / "docs" / "docs" +CODE_DIR = _ROOT_DIR / "libs" +ARXIV_ID_PATTERN = r"https://arxiv\.org/(abs|pdf)/(\d+\.\d+)" + + +@dataclass +class ArxivPaper: + """ArXiv paper information.""" + + arxiv_id: str + referencing_docs: list[str] # TODO: Add the referencing docs + referencing_api_refs: list[str] # TODO: Add the referencing docs + title: str + authors: list[str] + abstract: str + url: str + published_date: str + + +def search_documentation_for_arxiv_references(docs_dir: Path) -> dict[str, set[str]]: + """Search the documentation for arXiv references. + + Search for the arXiv references in the documentation pages. + Note: It finds only the first arXiv reference in a line. + + Args: + docs_dir: Path to the documentation root folder. + Returns: + dict: Dictionary with arxiv_id as key and set of file names as value. 
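    Example (illustrative file name):
        a line containing https://arxiv.org/abs/2212.10496 found in
        <root>/docs/docs/use_cases/hyde.ipynb is recorded as
        {"2212.10496": {"docs/use_cases/hyde"}}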
+ """ + arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN) + exclude_strings = {"file_path", "metadata", "link", "loader", "PyPDFLoader"} + + # loop all the files (ipynb, mdx, md) in the docs folder + files = ( + p.resolve() + for p in Path(docs_dir).glob("**/*") + if p.suffix in {".ipynb", ".mdx", ".md"} + ) + arxiv_id2file_names: dict[str, set[str]] = {} + for file in files: + if "-checkpoint.ipynb" in file.name: + continue + with open(file, "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + if any(exclude_string in line for exclude_string in exclude_strings): + continue + matches = arxiv_url_pattern.search(line) + if matches: + arxiv_id = matches.group(2) + file_name = _get_doc_path(file.parts, file.suffix) + if arxiv_id not in arxiv_id2file_names: + arxiv_id2file_names[arxiv_id] = {file_name} + else: + arxiv_id2file_names[arxiv_id].add(file_name) + return arxiv_id2file_names + + +def convert_module_name_and_members_to_urls( + arxiv_id2module_name_and_members: dict[str, set[str]], +) -> dict[str, set[str]]: + arxiv_id2urls = {} + for arxiv_id, module_name_and_members in arxiv_id2module_name_and_members.items(): + urls = set() + for module_name_and_member in module_name_and_members: + module_name, type_and_member = module_name_and_member.split(":") + if "$" in type_and_member: + type, member = type_and_member.split("$") + else: + type = type_and_member + member = "" + _namespace_parts = module_name.split(".") + if type == "module": + first_namespace_part = _namespace_parts[0] + if first_namespace_part.startswith("langchain_"): + first_namespace_part = first_namespace_part.replace( + "langchain_", "" + ) + url = f"{first_namespace_part}_api_reference.html#module-{module_name}" + elif type in ["class", "function"]: + second_namespace_part = _namespace_parts[1] + url = f"{second_namespace_part}/{module_name}.{member}.html#{module_name}.{member}" + else: + raise ValueError( + f"Unknown type: {type} in the {module_name_and_member}." + ) + urls.add(url) + arxiv_id2urls[arxiv_id] = urls + return arxiv_id2urls + + +def search_code_for_arxiv_references(code_dir: Path) -> dict[str, set[str]]: + """Search the code for arXiv references. + + Search for the arXiv references in the code. + Note: It finds only the first arXiv reference in a line. + + Args: + code_dir: Path to the code root folder. + Returns: + dict: Dictionary with arxiv_id as key and set of module names as value. + module names encoded as: + :module + :class$ + :function$ + """ + arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN) + # exclude_strings = {"file_path", "metadata", "link", "loader"} + class_pattern = re.compile(r"\s*class\s+(\w+).*:") + function_pattern = re.compile(r"\s*def\s+(\w+)") + + # loop all the files (ipynb, mdx, md) in the docs folder + files = ( + p.resolve() + for p in Path(code_dir).glob("**/*") + if p.suffix in {".py"} and "tests" not in p.parts and "scripts" not in p.parts + # ".md" files are excluded + ) + arxiv_id2module_name_and_members: dict[str, set[str]] = {} + for file in files: + try: + with open(file, "r", encoding="utf-8") as f: + module_name = _get_module_name(file.parts) + class_or_function_started = "module" + for line in f.readlines(): + # class line: + matches = class_pattern.search(line) + if matches: + class_name = matches.group(1) + class_or_function_started = f"class${class_name}" + + # function line: + # not inside a class! 
+ if "class" not in class_or_function_started: + matches = function_pattern.search(line) + if matches: + func_name = matches.group(1) + class_or_function_started = f"function${func_name}" + + # arxiv line: + matches = arxiv_url_pattern.search(line) + if matches: + arxiv_id = matches.group(2) + module_name_and_member = ( + f"{module_name}:{class_or_function_started}" + ) + if arxiv_id not in arxiv_id2module_name_and_members: + arxiv_id2module_name_and_members[arxiv_id] = { + module_name_and_member + } + else: + arxiv_id2module_name_and_members[arxiv_id].add( + module_name_and_member + ) + except UnicodeDecodeError: + # Skip files like this 'tests/integration_tests/examples/non-utf8-encoding.py' + logger.warning(f"Could not read the file {file}.") + + # handle border cases: + # 1. if a class or function of a module is referenced, drop the bare module entry, + # e.g. keep 'langchain_experimental.pal_chain.base:class$PALChain' and remove + # 'langchain_experimental.pal_chain.base:module' + for arxiv_id, module_name_and_members in arxiv_id2module_name_and_members.items(): + module_name_and_member_deduplicated = set() + non_module_members = set() + for module_name_and_member in module_name_and_members: + if not module_name_and_member.endswith(":module"): + module_name_and_member_deduplicated.add(module_name_and_member) + non_module_members.add(module_name_and_member.split(":")[0]) + for module_name_and_member in module_name_and_members: + if module_name_and_member.endswith(":module"): + if module_name_and_member.split(":")[0] in non_module_members: + continue + module_name_and_member_deduplicated.add(module_name_and_member) + arxiv_id2module_name_and_members[arxiv_id] = module_name_and_member_deduplicated + + # 2. {'langchain.evaluation.scoring.prompt:module', 'langchain.evaluation.comparison.prompt:module'} + # only modules with 2-part namespaces are parsed into API Reference now! TODO fix this behavior + # leave only the modules with 2-part namespaces + arxiv_id2module_name_and_members_reduced = {} + for arxiv_id, module_name_and_members in arxiv_id2module_name_and_members.items(): + module_name_and_member_reduced = set() + removed_modules = set() + for module_name_and_member in module_name_and_members: + if module_name_and_member.endswith(":module"): + if module_name_and_member.split(":")[0].count(".") <= 1: + module_name_and_member_reduced.add(module_name_and_member) + else: + removed_modules.add(module_name_and_member) + else: + module_name_and_member_reduced.add(module_name_and_member) + if module_name_and_member_reduced: + arxiv_id2module_name_and_members_reduced[arxiv_id] = ( + module_name_and_member_reduced + ) + if removed_modules: + logger.warning( + f"{arxiv_id}: Removed the following modules with 3+-part namespaces: {removed_modules}." + ) + return arxiv_id2module_name_and_members_reduced + + +def _get_doc_path(file_parts: tuple[str, ...], file_extension) -> str: + """Get the relative path to the documentation page + from the absolute path of the file. + The file extension is stripped. + """ + res = [] + for el in file_parts[::-1]: + res.append(el) + if el == "docs": + break + ret = "/".join(reversed(res)) + return ret[: -len(file_extension)] if ret.endswith(file_extension) else ret + + +def _get_code_path(file_parts: tuple[str, ...]) -> str: + """Get the relative path to the code file + from the absolute path of the file. 
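    Example (illustrative path):
        (..., "libs", "community", "langchain_community", "llms", "foo.py")
        -> "libs/community/langchain_community/llms/foo.py"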
+ """ + res = [] + for el in file_parts[::-1]: + res.append(el) + if el == "libs": + break + return "/".join(reversed(res)) + + +def _get_module_name(file_parts: tuple[str, ...]) -> str: + """Get the module name from the absolute path of the file.""" + ns_parts = [] + for el in file_parts[::-1]: + if str(el) == "__init__.py": + continue + ns_parts.insert(0, str(el).replace(".py", "")) + if el.startswith("langchain"): + break + return ".".join(ns_parts) + + +def compound_urls( + arxiv_id2file_names: dict[str, set[str]], arxiv_id2code_urls: dict[str, set[str]] +) -> dict[str, dict[str, set[str]]]: + arxiv_id2urls = dict() + for arxiv_id, code_urls in arxiv_id2code_urls.items(): + arxiv_id2urls[arxiv_id] = {"api": code_urls} + # intersection of the two sets + if arxiv_id in arxiv_id2file_names: + arxiv_id2urls[arxiv_id]["docs"] = arxiv_id2file_names[arxiv_id] + for arxiv_id, file_names in arxiv_id2file_names.items(): + if arxiv_id not in arxiv_id2code_urls: + arxiv_id2urls[arxiv_id] = {"docs": file_names} + # reverse sort by the arxiv_id (the newest papers first) + ret = dict(sorted(arxiv_id2urls.items(), key=lambda item: item[0], reverse=True)) + return ret + + +def _format_doc_link(doc_paths: list[str]) -> list[str]: + return [ + f"[{doc_path}](https://python.langchain.com/{doc_path})" + for doc_path in doc_paths + ] + + +def _format_api_ref_link( + doc_paths: list[str], compact: bool = False +) -> list[str]: # TODO + # agents/langchain_core.agents.AgentAction.html#langchain_core.agents.AgentAction + ret = [] + for doc_path in doc_paths: + module = doc_path.split("#")[1].replace("module-", "") + if compact and module.count(".") > 2: + # langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI + # -> langchain_community.llms...OCIModelDeploymentTGI + module_parts = module.split(".") + module = f"{module_parts[0]}.{module_parts[1]}...{module_parts[-1]}" + ret.append( + f"[{module}](https://api.python.langchain.com/en/latest/{doc_path.split('langchain.com/')[-1]})" + ) + return ret + + +def log_results(arxiv_id2urls): + arxiv_ids = arxiv_id2urls.keys() + doc_number, api_number = 0, 0 + for urls in arxiv_id2urls.values(): + if "docs" in urls: + doc_number += len(urls["docs"]) + if "api" in urls: + api_number += len(urls["api"]) + logger.info( + f"Found {len(arxiv_ids)} arXiv references in the {doc_number} docs and in {api_number} API Refs." + ) + + +class ArxivAPIWrapper(BaseModel): + arxiv_search: Any #: :meta private: + arxiv_exceptions: Any # :meta private: + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + """Validate that the python package exists in environment.""" + try: + import arxiv + + values["arxiv_search"] = arxiv.Search + values["arxiv_exceptions"] = ( + arxiv.ArxivError, + arxiv.UnexpectedEmptyPageError, + arxiv.HTTPError, + ) + except ImportError: + raise ImportError( + "Could not import arxiv python package. " + "Please install it with `pip install arxiv`." + ) + return values + + def get_papers( + self, arxiv_id2urls: dict[str, dict[str, set[str]]] + ) -> list[ArxivPaper]: + """ + Performs an arxiv search and returns information about the papers found. + + If an error occurs or no documents found, error text + is returned instead. + Args: + arxiv_id2urls: Dictionary with arxiv_id as key and dictionary + with sets of doc file names and API Ref urls. + + Returns: + List of ArxivPaper objects. 
+ """ # noqa: E501 + + def cut_authors(authors: list) -> list[str]: + if len(authors) > 3: + return [str(a) for a in authors[:3]] + [" et al."] + else: + return [str(a) for a in authors] + + if not arxiv_id2urls: + return [] + try: + arxiv_ids = list(arxiv_id2urls.keys()) + results = self.arxiv_search( + id_list=arxiv_ids, + max_results=len(arxiv_ids), + ).results() + except self.arxiv_exceptions as ex: + raise ex + papers = [ + ArxivPaper( + arxiv_id=result.entry_id.split("/")[-1], + title=result.title, + authors=cut_authors(result.authors), + abstract=result.summary, + url=result.entry_id, + published_date=str(result.published.date()), + referencing_docs=urls["docs"] if "docs" in urls else [], + referencing_api_refs=urls["api"] if "api" in urls else [], + ) + for result, urls in zip(results, arxiv_id2urls.values()) + ] + return papers + + +def generate_arxiv_references_page(file_name: str, papers: list[ArxivPaper]) -> None: + with open(file_name, "w") as f: + # Write the table headers + f.write("""# arXiv + +LangChain implements the latest research in the field of Natural Language Processing. +This page contains `arXiv` papers referenced in the LangChain Documentation and API Reference. + +## Summary + +| arXiv id / Title | Authors | Published date 🔻 | LangChain Documentation and API Reference | +|------------------|---------|-------------------|-------------------------| +""") + for paper in papers: + refs = [] + if paper.referencing_docs: + refs += [ + "`Docs:` " + ", ".join(_format_doc_link(paper.referencing_docs)) + ] + if paper.referencing_api_refs: + refs += [ + "`API:` " + + ", ".join( + _format_api_ref_link(paper.referencing_api_refs, compact=True) + ) + ] + refs_str = ", ".join(refs) + + title_link = f"[{paper.title}]({paper.url})" + f.write( + f"| {' | '.join([f'`{paper.arxiv_id}` {title_link}', ', '.join(paper.authors), paper.published_date, refs_str])}\n" + ) + + for paper in papers: + docs_refs = ( + f"- **LangChain Documentation:** {', '.join(_format_doc_link(paper.referencing_docs))}" + if paper.referencing_docs + else "" + ) + api_ref_refs = ( + f"- **LangChain API Reference:** {', '.join(_format_api_ref_link(paper.referencing_api_refs))}" + if paper.referencing_api_refs + else "" + ) + f.write(f""" +## {paper.title} + +- **arXiv id:** {paper.arxiv_id} +- **Title:** {paper.title} +- **Authors:** {', '.join(paper.authors)} +- **Published Date:** {paper.published_date} +- **URL:** {paper.url} +{docs_refs} +{api_ref_refs} + +**Abstract:** {paper.abstract} + """) + + logger.info(f"Created the {file_name} file with {len(papers)} arXiv references.") + + +def main(): + # search the documentation and the API Reference for arXiv references: + arxiv_id2module_name_and_members = search_code_for_arxiv_references(CODE_DIR) + arxiv_id2code_urls = convert_module_name_and_members_to_urls( + arxiv_id2module_name_and_members + ) + arxiv_id2file_names = search_documentation_for_arxiv_references(DOCS_DIR) + arxiv_id2urls = compound_urls(arxiv_id2file_names, arxiv_id2code_urls) + log_results(arxiv_id2urls) + + # get the arXiv paper information + papers = ArxivAPIWrapper().get_papers(arxiv_id2urls) + + # generate the arXiv references page + output_file = str(DOCS_DIR / "additional_resources" / "arxiv_references.mdx") + generate_arxiv_references_page(output_file, papers) + + +if __name__ == "__main__": + main()