2024-05-17 18:28:57 +00:00
|
|
|
"""Parse arXiv references from the documentation.
|
|
|
|
Generate a page with a table of the arXiv references with links to the documentation pages.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from pathlib import Path
|
2024-05-27 22:47:02 +00:00
|
|
|
from typing import Any, Dict
|
2024-05-17 18:28:57 +00:00
|
|
|
|
|
|
|
from pydantic.v1 import BaseModel, root_validator
|
|
|
|
|
|
|
|
# Module-level logger; the functions below report their progress through it.
logger = logging.getLogger(__name__)

# Repository root: two directory levels above the folder that holds this file.
_ROOT_DIR = Path(os.path.abspath(__file__)).parents[2]
# Folders that are scanned for arXiv links.
DOCS_DIR = _ROOT_DIR / "docs" / "docs"
CODE_DIR = _ROOT_DIR / "libs"
TEMPLATES_DIR = _ROOT_DIR / "templates"
COOKBOOKS_DIR = _ROOT_DIR / "cookbook"
# Matches an arXiv "abs" or "pdf" link; group 2 captures the numeric paper id.
ARXIV_ID_PATTERN = r"https://arxiv\.org/(abs|pdf)/(\d+\.\d+)"
# Host name used when building documentation and API Reference links.
LANGCHAIN_PYTHON_URL = "python.langchain.com"
|
2024-05-17 18:28:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ArxivPaper:
    """Metadata of one arXiv paper together with the places that cite it.

    Every ``referencing_*2url`` field maps a display key to the URL of a
    page that references the paper; a mapping is empty when there is no
    reference of that kind.
    """

    # identifier of the paper on arXiv
    arxiv_id: str
    # documentation pages citing the paper
    referencing_doc2url: dict[str, str]
    # API Reference entries citing the paper
    referencing_api_ref2url: dict[str, str]
    # templates citing the paper
    referencing_template2url: dict[str, str]
    # cookbooks citing the paper
    referencing_cookbook2url: dict[str, str]
    # bibliographic data of the paper itself
    title: str
    authors: list[str]
    abstract: str
    url: str
    published_date: str
|
|
|
|
|
|
|
|
|
|
|
|
def search_documentation_for_arxiv_references(docs_dir: Path) -> dict[str, set[str]]:
    """Collect the arXiv papers linked from the documentation pages.

    Every ``.ipynb``, ``.mdx`` and ``.md`` file below ``docs_dir`` is read
    line by line (Jupyter checkpoint notebooks are skipped). Lines that
    contain one of the exclusion markers are ignored.
    Note: only the first arXiv link of a line is taken into account.

    Args:
        docs_dir: Path to the documentation root folder.

    Returns:
        dict: arxiv_id -> set of documentation paths (as produced by
            ``_get_doc_path``) of the pages that link to the paper.
    """
    url_regex = re.compile(ARXIV_ID_PATTERN)
    # a line containing any of these substrings is not treated as a citation
    skip_markers = ("file_path", "metadata", "link", "loader", "PyPDFLoader")
    doc_suffixes = (".ipynb", ".mdx", ".md")

    references: dict[str, set[str]] = {}
    for candidate in Path(docs_dir).glob("**/*"):
        if candidate.suffix not in doc_suffixes:
            continue
        doc_file = candidate.resolve()
        if "-checkpoint.ipynb" in doc_file.name:
            continue
        with open(doc_file, "r", encoding="utf-8") as handle:
            for text in handle.readlines():
                if any(marker in text for marker in skip_markers):
                    continue
                found = url_regex.search(text)
                if not found:
                    continue
                doc_path = _get_doc_path(doc_file.parts, doc_file.suffix)
                references.setdefault(found.group(2), set()).add(doc_path)
    return references
|
|
|
|
|
|
|
|
|
|
|
|
def _drop_shadowed_module_entries(members: set[str]) -> set[str]:
    """Drop ``<module>:module`` entries of modules that also have a class/function entry."""
    specific = {member for member in members if not member.endswith(":module")}
    covered_modules = {member.split(":")[0] for member in specific}
    return specific | {
        member
        for member in members
        if member.endswith(":module") and member.split(":")[0] not in covered_modules
    }


def _split_by_namespace_depth(members: set[str]) -> tuple[set[str], set[str]]:
    """Split into (kept, removed); removed are ``:module`` entries with more than one dot."""
    kept: set[str] = set()
    removed: set[str] = set()
    for member in members:
        if member.endswith(":module") and member.split(":")[0].count(".") > 1:
            removed.add(member)
        else:
            kept.add(member)
    return kept, removed


def search_code_for_arxiv_references(code_dir: Path) -> dict[str, set[str]]:
    """Search the code for arXiv references.

    Scans every ``.py`` file below ``code_dir`` (paths containing a ``tests``
    or ``scripts`` component are skipped) and attributes each arXiv link to
    the class or function whose header line was seen last, or to the module
    when no header was seen yet. Once a class header is seen, everything up
    to the next class header is attributed to that class.
    Note: It finds only the first arXiv reference in a line.

    Args:
        code_dir: Path to the code root folder.

    Returns:
        dict: Dictionary with arxiv_id as key and set of module names as value.
          module names encoded as:
            <module_name>:module
            <module_name>:class$<ClassName>
            <module_name>:function$<function_name>
    """
    arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN)
    # Both header patterns are applied with ``match`` so that they only fire
    # on real header lines, not on prose such as "Base class for things:".
    class_pattern = re.compile(r"\s*class\s+(\w+).*:")
    function_pattern = re.compile(r"\s*(?:async\s+)?def\s+(\w+)")

    # loop over all the Python files in the code folder
    files = (
        p.resolve()
        for p in Path(code_dir).glob("**/*")
        if p.suffix in {".py"} and "tests" not in p.parts and "scripts" not in p.parts
    )
    arxiv_id2module_name_and_members: dict[str, set[str]] = {}
    for file in files:
        try:
            with open(file, "r", encoding="utf-8") as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            # Skip files like this 'tests/integration_tests/examples/non-utf8-encoding.py'
            logger.warning(f"Could not read the file {file}.")
            continue

        module_name = _get_module_name(file.parts)
        class_or_function_started = "module"
        for line in lines:
            # class line:
            matches = class_pattern.match(line)
            if matches:
                class_or_function_started = f"class${matches.group(1)}"

            # function line (ignored while inside a class). The prefix test
            # keeps a function such as ``classify`` from being mistaken for
            # a class scope, which a plain substring test would do.
            if not class_or_function_started.startswith("class$"):
                matches = function_pattern.match(line)
                if matches:
                    class_or_function_started = f"function${matches.group(1)}"

            # arxiv line:
            matches = arxiv_url_pattern.search(line)
            if matches:
                arxiv_id2module_name_and_members.setdefault(
                    matches.group(2), set()
                ).add(f"{module_name}:{class_or_function_started}")

    # handle border cases:
    # 1. {'langchain_experimental.pal_chain.base:class$PALChain',
    #     'langchain_experimental.pal_chain.base:module' - remove}
    # 2. {'langchain.evaluation.scoring.prompt:module', 'langchain.evaluation.comparison.prompt:module'}
    #    only modules with 2-part namespaces are parsed into API Reference now! TODO fix this behavior
    #    leave only the modules with 2-part namespaces
    arxiv_id2module_name_and_members_reduced = {}
    for arxiv_id, module_name_and_members in arxiv_id2module_name_and_members.items():
        kept, removed_modules = _split_by_namespace_depth(
            _drop_shadowed_module_entries(module_name_and_members)
        )
        if kept:
            arxiv_id2module_name_and_members_reduced[arxiv_id] = kept
        if removed_modules:
            logger.warning(
                f"{arxiv_id}: Removed the following modules with 2+ -part namespaces: {removed_modules}."
            )
    return arxiv_id2module_name_and_members_reduced
|
|
|
|
|
|
|
|
|
2024-05-20 22:30:35 +00:00
|
|
|
def search_templates_for_arxiv_references(templates_dir: Path) -> dict[str, set[str]]:
    """Collect the arXiv papers linked from the template README files.

    Only files named ``readme.md`` (case-insensitive) are read, because they
    are parsed into the LangChain documentation; a README whose parent
    folder is called ``templates`` is left out.
    Note: only the first arXiv link of a line is taken into account.

    Args:
        templates_dir: Path to the templates root folder.

    Returns:
        dict: arxiv_id -> set of template names (name of the folder that
            holds the README).
    """
    url_regex = re.compile(ARXIV_ID_PATTERN)
    references: dict[str, set[str]] = {}
    for candidate in Path(templates_dir).glob("**/*"):
        if candidate.name.lower() not in {"readme.md"}:
            continue
        if candidate.parent.name == "templates":
            # the README of the root folder does not describe a template
            continue
        readme = candidate.resolve()
        with open(readme, "r", encoding="utf-8") as handle:
            for text in handle.readlines():
                found = url_regex.search(text)
                if found:
                    references.setdefault(found.group(2), set()).add(readme.parent.name)
    return references
|
|
|
|
|
|
|
|
|
2024-05-27 22:47:02 +00:00
|
|
|
def search_cookbooks_for_arxiv_references(cookbooks_dir: Path) -> dict[str, set[str]]:
    """Collect the arXiv papers linked from the cookbook notebooks.

    Every ``.ipynb`` file below ``cookbooks_dir`` is read line by line.
    Note: only the first arXiv link of a line is taken into account.

    Args:
        cookbooks_dir: Path to the cookbooks root folder.

    Returns:
        dict: arxiv_id -> set of cookbook names (notebook file name without
            the extension).
    """
    url_regex = re.compile(ARXIV_ID_PATTERN)
    references: dict[str, set[str]] = {}
    for candidate in Path(cookbooks_dir).glob("**/*.ipynb"):
        notebook = candidate.resolve()
        with open(notebook, "r", encoding="utf-8") as handle:
            for text in handle.readlines():
                found = url_regex.search(text)
                if found:
                    references.setdefault(found.group(2), set()).add(notebook.stem)
    return references
|
|
|
|
|
|
|
|
|
|
|
|
def convert_module_name_and_members_to_urls(
    arxiv_id2module_name_and_members: dict[str, set[str]],
) -> dict[str, set[str]]:
    """Translate encoded module members into relative API Reference paths.

    Args:
        arxiv_id2module_name_and_members: arxiv_id -> set of entries encoded
            as ``<module>:module``, ``<module>:class$<Name>`` or
            ``<module>:function$<name>``.

    Returns:
        dict: arxiv_id -> set of relative paths (page plus ``#`` anchor).

    Raises:
        ValueError: If an entry has a type other than module/class/function.
    """

    def to_relative_url(encoded: str) -> str:
        module_name, kind_and_member = encoded.split(":")
        if "$" in kind_and_member:
            kind, member = kind_and_member.split("$")
        else:
            kind, member = kind_and_member, ""
        namespace = module_name.split(".")

        if kind == "module":
            # module pages are named after the package without "langchain_"
            package = namespace[0]
            if package.startswith("langchain_"):
                package = package.replace("langchain_", "")
            return f"{package}_api_reference.html#module-{module_name}"
        if kind in ["class", "function"]:
            # member pages live in a folder named after the second namespace part
            return f"{namespace[1]}/{module_name}.{member}.html#{module_name}.{member}"
        raise ValueError(f"Unknown type: {kind} in the {encoded}.")

    return {
        paper_id: {to_relative_url(encoded) for encoded in encoded_members}
        for paper_id, encoded_members in arxiv_id2module_name_and_members.items()
    }
|
|
|
|
|
|
|
|
|
2024-05-17 18:28:57 +00:00
|
|
|
def _get_doc_path(file_parts: tuple[str, ...], file_extension) -> str:
|
|
|
|
"""Get the relative path to the documentation page
|
|
|
|
from the absolute path of the file.
|
|
|
|
Remove file_extension
|
|
|
|
"""
|
|
|
|
res = []
|
|
|
|
for el in file_parts[::-1]:
|
|
|
|
res.append(el)
|
|
|
|
if el == "docs":
|
|
|
|
break
|
|
|
|
ret = "/".join(reversed(res))
|
|
|
|
return ret[: -len(file_extension)] if ret.endswith(file_extension) else ret
|
|
|
|
|
|
|
|
|
|
|
|
def _get_code_path(file_parts: tuple[str, ...]) -> str:
|
|
|
|
"""Get the relative path to the documentation page
|
|
|
|
from the absolute path of the file.
|
|
|
|
"""
|
|
|
|
res = []
|
|
|
|
for el in file_parts[::-1]:
|
|
|
|
res.append(el)
|
|
|
|
if el == "libs":
|
|
|
|
break
|
|
|
|
return "/".join(reversed(res))
|
|
|
|
|
|
|
|
|
|
|
|
def _get_module_name(file_parts: tuple[str, ...]) -> str:
|
|
|
|
"""Get the module name from the absolute path of the file."""
|
|
|
|
ns_parts = []
|
|
|
|
for el in file_parts[::-1]:
|
|
|
|
if str(el) == "__init__.py":
|
|
|
|
continue
|
|
|
|
ns_parts.insert(0, str(el).replace(".py", ""))
|
|
|
|
if el.startswith("langchain"):
|
|
|
|
break
|
|
|
|
return ".".join(ns_parts)
|
|
|
|
|
|
|
|
|
2024-05-20 22:30:35 +00:00
|
|
|
def _is_url_ok(url: str) -> bool:
    """Check if the url page is open without error.

    Args:
        url: Address of the page to request.

    Returns:
        True when a GET request succeeds with a non-error status; False
        (after logging a warning) on any ``requests`` error, including a
        timeout.
    """
    import requests

    # Without a timeout a single unresponsive server would block the whole
    # run indefinitely.
    timeout_seconds = 30
    try:
        response = requests.get(url, timeout=timeout_seconds)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        logger.warning(f"Could not open the {url}.")
        return False
    return True
|
2024-05-17 18:28:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
class ArxivAPIWrapper(BaseModel):
    """Wrapper around the ``arxiv`` package used to fetch paper metadata."""

    arxiv_search: Any  #: :meta private:
    arxiv_exceptions: Any  # :meta private:

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the python package exists in environment."""
        try:
            import arxiv

            values["arxiv_search"] = arxiv.Search
            values["arxiv_exceptions"] = (
                arxiv.ArxivError,
                arxiv.UnexpectedEmptyPageError,
                arxiv.HTTPError,
            )
        except ImportError:
            raise ImportError(
                "Could not import arxiv python package. "
                "Please install it with `pip install arxiv`."
            )
        return values

    def get_papers(
        self, arxiv_id2type2key2urls: dict[str, dict[str, dict[str, str]]]
    ) -> list[ArxivPaper]:
        """
        Performs an arxiv search and returns information about the papers found.

        Errors raised by the ``arxiv`` package are not handled here; they
        propagate to the caller.

        Args:
            arxiv_id2type2key2urls: Dictionary with arxiv_id as key and dictionary
                with dicts of doc file names/API objects/templates to urls.

        Returns:
            List of ArxivPaper objects, in the order the search returns them.
            An empty list is returned for an empty input.
        """

        def cut_authors(authors: list) -> list[str]:
            # keep at most three names and mark the truncation
            if len(authors) > 3:
                return [str(a) for a in authors[:3]] + [" et al."]
            else:
                return [str(a) for a in authors]

        if not arxiv_id2type2key2urls:
            return []

        arxiv_ids = list(arxiv_id2type2key2urls.keys())
        results = self.arxiv_search(
            id_list=arxiv_ids,
            max_results=len(arxiv_ids),
        ).results()

        papers = []
        for result in results:
            # presumably the last URL segment is the id with a version
            # suffix (e.g. "2305.12345v2") - confirm against the arxiv package
            versioned_id = result.entry_id.split("/")[-1]
            # Look the references up by id instead of relying on the result
            # order: a missing or reordered result must not shift the
            # references onto a different paper.
            requested_id = re.sub(r"v\d+$", "", versioned_id)
            type2key2urls = arxiv_id2type2key2urls.get(requested_id, {})
            papers.append(
                ArxivPaper(
                    arxiv_id=versioned_id,
                    title=result.title,
                    authors=cut_authors(result.authors),
                    abstract=result.summary,
                    url=result.entry_id,
                    published_date=str(result.published.date()),
                    referencing_doc2url=type2key2urls.get("docs", {}),
                    referencing_api_ref2url=type2key2urls.get("apis", {}),
                    referencing_template2url=type2key2urls.get("templates", {}),
                    referencing_cookbook2url=type2key2urls.get("cookbooks", {}),
                )
            )
        return papers
|
|
|
|
|
|
|
|
|
2024-05-20 22:30:35 +00:00
|
|
|
def _format_doc_url(doc_path: str) -> str:
    """Return the documentation site URL for a documentation page path."""
    host = LANGCHAIN_PYTHON_URL
    return f"https://{host}/{doc_path}"
|
|
|
|
|
|
|
|
|
|
|
|
def _format_api_ref_url(doc_path: str, compact: bool = False) -> str:
    """Return the API Reference URL for a relative API Reference path.

    Example of ``doc_path``:
    agents/langchain_core.agents.AgentAction.html#langchain_core.agents.AgentAction

    Args:
        doc_path: Relative path; if it contains ``langchain.com/`` only the
            part after its last occurrence is used.
        compact: Not used by this function.
    """
    relative_path = doc_path.split("langchain.com/")[-1]
    return f"https://api.{LANGCHAIN_PYTHON_URL}/en/latest/{relative_path}"
|
|
|
|
|
|
|
|
|
|
|
|
def _format_template_url(template_name: str) -> str:
    """Return the documentation site URL of the page of a template."""
    host = LANGCHAIN_PYTHON_URL
    return f"https://{host}/docs/templates/{template_name}"
|
|
|
|
|
|
|
|
|
2024-05-27 22:47:02 +00:00
|
|
|
def _format_cookbook_url(cookbook_name: str) -> str:
|
|
|
|
return f"https://github.com/langchain-ai/langchain/blob/master/cookbook/{cookbook_name}.ipynb"
|
|
|
|
|
|
|
|
|
2024-05-20 22:30:35 +00:00
|
|
|
def _compact_module_full_name(doc_path: str) -> str:
|
|
|
|
# agents/langchain_core.agents.AgentAction.html#langchain_core.agents.AgentAction
|
|
|
|
module = doc_path.split("#")[1].replace("module-", "")
|
|
|
|
if module.count(".") > 2:
|
|
|
|
# langchain_community.llms.oci_data_science_model_deployment_endpoint.OCIModelDeploymentTGI
|
2024-06-06 23:51:02 +00:00
|
|
|
# -> langchain_community...OCIModelDeploymentTGI
|
2024-05-20 22:30:35 +00:00
|
|
|
module_parts = module.split(".")
|
2024-06-06 23:51:02 +00:00
|
|
|
module = f"{module_parts[0]}...{module_parts[-1]}"
|
2024-05-20 22:30:35 +00:00
|
|
|
return module
|
|
|
|
|
|
|
|
|
2024-05-27 22:47:02 +00:00
|
|
|
def compound_urls(
    arxiv_id2file_names: dict[str, set[str]],
    arxiv_id2code_urls: dict[str, set[str]],
    arxiv_id2templates: dict[str, set[str]],
    arxiv_id2cookbooks: dict[str, set[str]],
) -> dict[str, dict[str, dict[str, str]]]:
    """Format the reference URLs, keep the reachable ones and group them by paper.

    Args:
        arxiv_id2file_names: arxiv_id -> documentation page paths.
        arxiv_id2code_urls: arxiv_id -> relative API Reference paths.
        arxiv_id2templates: arxiv_id -> template names.
        arxiv_id2cookbooks: arxiv_id -> cookbook names.

    Returns:
        dict: arxiv_id -> reference type (``"docs"``, ``"apis"``,
            ``"templates"``, ``"cookbooks"``) -> key -> URL. Only URLs that
            pass ``_is_url_ok`` are included; types and papers without any
            such URL are omitted. Papers are ordered by arxiv_id, descending.
    """
    # (reference type, found keys per paper, URL formatter) in processing order
    sources = (
        ("docs", arxiv_id2file_names, _format_doc_url),
        ("apis", arxiv_id2code_urls, _format_api_ref_url),
        ("templates", arxiv_id2templates, _format_template_url),
        ("cookbooks", arxiv_id2cookbooks, _format_cookbook_url),
    )

    # format urls and verify that the urls are correct
    arxiv_id2type2key2urls: dict[str, dict[str, dict[str, str]]] = {}
    for ref_type, arxiv_id2keys, format_url in sources:
        for arxiv_id, keys in arxiv_id2keys.items():
            key2urls = {}
            for key in keys:
                url = format_url(key)
                if _is_url_ok(url):
                    key2urls[key] = url
            if key2urls:
                arxiv_id2type2key2urls.setdefault(arxiv_id, {})[ref_type] = key2urls

    # reverse sort by the arxiv_id (the newest papers first)
    return dict(
        sorted(arxiv_id2type2key2urls.items(), key=lambda item: item[0], reverse=True)
    )
|
|
|
|
|
|
|
|
|
2024-05-20 22:30:35 +00:00
|
|
|
def log_results(arxiv_id2type2key2urls):
    """Log how many papers were found and how many references of each type.

    Args:
        arxiv_id2type2key2urls: arxiv_id -> reference type -> key -> URL.
    """
    totals = {"docs": 0, "apis": 0, "templates": 0, "cookbooks": 0}
    for type2key2url in arxiv_id2type2key2urls.values():
        for ref_type in totals:
            if ref_type in type2key2url:
                totals[ref_type] += len(type2key2url[ref_type])

    paper_count = len(arxiv_id2type2key2urls.keys())
    doc_number = totals["docs"]
    api_number = totals["apis"]
    templates_number = totals["templates"]
    cookbooks_number = totals["cookbooks"]
    logger.warning(
        f"Found {paper_count} arXiv references in the {doc_number} docs, {api_number} API Refs,"
        f" {templates_number} Templates, and {cookbooks_number} Cookbooks."
    )
|
|
|
|
|
|
|
|
|
|
|
|
def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) -> None:
    """Write the arXiv references page.

    The page consists of an introduction, a summary table with one row per
    paper and a detailed section per paper.

    Args:
        file_name: Path of the file to (over)write.
        papers: Papers to list, in output order.
    """

    def links(key2url: dict[str, str], compact: bool = False) -> str:
        # comma-separated markdown links; API keys are shortened for display
        return ", ".join(
            f"[{_compact_module_full_name(key) if compact else key}]({url})"
            for key, url in key2url.items()
        )

    # The page contains non-ASCII characters, so the encoding is fixed
    # instead of depending on the platform default.
    with open(file_name, "w", encoding="utf-8") as f:
        # Write the table headers
        f.write(
            """# arXiv

LangChain implements the latest research in the field of Natural Language Processing.
This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference,
Templates, and Cookbooks.

From the opposite direction, scientists use LangChain in research and reference LangChain in the research papers.
Here you find [such papers](https://arxiv.org/search/?query=langchain&searchtype=all&source=header).

## Summary

| arXiv id / Title | Authors | Published date 🔻 | LangChain Documentation|
|------------------|---------|-------------------|------------------------|
"""
        )

        # summary table: one row per paper
        for paper in papers:
            refs = []
            if paper.referencing_doc2url:
                refs += ["`Docs:` " + links(paper.referencing_doc2url)]
            if paper.referencing_api_ref2url:
                refs += ["`API:` " + links(paper.referencing_api_ref2url, compact=True)]
            if paper.referencing_template2url:
                refs += ["`Template:` " + links(paper.referencing_template2url)]
            if paper.referencing_cookbook2url:
                refs += ["`Cookbook:` " + links(paper.referencing_cookbook2url)]
            refs_str = ", ".join(refs)

            title_link = f"[{paper.title}]({paper.url})"
            cells = [
                f"`{paper.arxiv_id}` {title_link}",
                ", ".join(paper.authors),
                paper.published_date,
                refs_str,
            ]
            f.write(f"| {' | '.join(cells)}\n")

        # detailed section per paper
        for paper in papers:
            docs_refs = (
                f" - **Documentation:** {links(paper.referencing_doc2url)}"
                if paper.referencing_doc2url
                else ""
            )
            api_ref_refs = (
                f" - **API Reference:** {links(paper.referencing_api_ref2url, compact=True)}"
                if paper.referencing_api_ref2url
                else ""
            )
            template_refs = (
                f" - **Template:** {links(paper.referencing_template2url)}"
                if paper.referencing_template2url
                else ""
            )
            cookbook_refs = (
                f" - **Cookbook:** {links(paper.referencing_cookbook2url)}"
                if paper.referencing_cookbook2url
                else ""
            )
            refs = "\n".join(
                [
                    el
                    for el in [docs_refs, api_ref_refs, template_refs, cookbook_refs]
                    if el
                ]
            )
            f.write(
                f"""
## {paper.title}

- **arXiv id:** {paper.arxiv_id}
- **Title:** {paper.title}
- **Authors:** {', '.join(paper.authors)}
- **Published Date:** {paper.published_date}
- **URL:** {paper.url}
- **LangChain:**

{refs}

**Abstract:** {paper.abstract}
"""
            )

    logger.warning(f"Created the {file_name} file with {len(papers)} arXiv references.")
|
2024-05-17 18:28:57 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Find the arXiv references, fetch the paper metadata and write the page."""
    # API Reference: locate the citing code members and turn them into paths
    code_members = search_code_for_arxiv_references(CODE_DIR)
    api_ref_paths = convert_module_name_and_members_to_urls(code_members)
    # documentation pages, templates and cookbooks
    doc_pages = search_documentation_for_arxiv_references(DOCS_DIR)
    templates = search_templates_for_arxiv_references(TEMPLATES_DIR)
    cookbooks = search_cookbooks_for_arxiv_references(COOKBOOKS_DIR)

    # merge everything into one mapping per paper and report the totals
    references = compound_urls(doc_pages, api_ref_paths, templates, cookbooks)
    log_results(references)

    # get the arXiv paper information
    papers = ArxivAPIWrapper().get_papers(references)

    # generate the arXiv references page
    generate_arxiv_references_page(
        DOCS_DIR / "additional_resources" / "arxiv_references.mdx", papers
    )
|
|
|
|
|
|
|
|
|
|
|
|
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|