mirror of
https://github.com/hwchase17/langchain
synced 2024-11-02 09:40:22 +00:00
0600998f38
- **Description:** Adds Wikidata support to langchain. Can read out documents from Wikidata. - **Issue:** N/A - **Dependencies:** Adds implicit dependencies for `wikibase-rest-api-client` (for turning items into docs) and `mediawikiapi` (for hitting the search endpoint) - **Twitter handle:** @derenrich You can see an example of this tool used in a chain [here](https://nbviewer.org/urls/d.erenrich.net/upload/Wikidata_Langchain.ipynb) or [here](https://nbviewer.org/urls/d.erenrich.net/upload/Wikidata_Lars_Kai_Hansen.ipynb) <!-- Thank you for contributing to LangChain! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` from the root of the package you've modified to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://python.langchain.com/docs/contributing/ If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. -->
182 lines
5.2 KiB
Python
182 lines
5.2 KiB
Python
"""Util that calls Wikidata."""
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from langchain_core.documents import Document
|
|
from langchain_core.pydantic_v1 import BaseModel, root_validator
|
|
|
|
logger = logging.getLogger(__name__)

# Queries are clipped to this many characters before being sent to the search API.
WIKIDATA_MAX_QUERY_LENGTH = 300

# Common properties you probably want to see, selected from the full list at
# https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all
# Used as the default value of ``WikidataAPIWrapper.wikidata_props``.
# Each property ID must appear only once.
DEFAULT_PROPERTIES = [
    "P31",
    "P279",
    "P27",
    "P361",
    "P527",
    "P495",
    "P17",
    "P585",
    "P131",
    "P106",
    "P21",
    "P569",
    "P570",
    "P577",
    "P50",
    "P571",
    "P641",
    "P625",
    "P19",
    "P69",
    "P108",
    "P136",
    "P39",
    "P161",
    "P20",
    "P101",
    "P179",
    "P175",
    "P7937",
    "P57",
    "P607",
    "P509",
    "P800",
    "P449",
    "P580",
    "P582",
    "P276",
    "P112",
    "P740",
    "P159",
    "P452",
    "P102",
    "P1142",
    "P1387",
    "P1576",
    "P140",
    "P178",
    "P287",
    "P25",
    "P22",
    "P40",
    "P185",
    "P802",
    "P1416",
]

# Default language code handed to the Wikibase client.
DEFAULT_LANG_CODE = "en"
# User-Agent string sent with requests to both Wikidata endpoints.
WIKIDATA_USER_AGENT = "langchain-wikidata"
# MediaWiki action API endpoint (used for search).
WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"
# Wikibase REST API endpoint (used for fetching item content).
WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v0/"
|
|
|
|
|
|
class WikidataAPIWrapper(BaseModel):
    """Wrapper around the Wikidata API.

    To use, you should have the ``wikibase-rest-api-client`` and
    ``mediawikiapi`` python packages installed.
    This wrapper will use the Wikibase APIs to conduct searches and
    fetch item content. By default, it will return the item content
    of the top-k results.
    It limits the Document content by doc_content_chars_max.
    """

    # Client objects; populated by ``validate_environment``.
    wikidata_mw: Any  #: :meta private:
    wikidata_rest: Any  #: :meta private:
    # Maximum number of search hits that are turned into documents.
    top_k_results: int = 2
    # NOTE(review): not referenced by any method of this class.
    load_all_available_meta: bool = False
    # Maximum number of characters kept in a document / in the ``run`` output.
    doc_content_chars_max: int = 4000
    # Property IDs passed to the Wikibase client as ``supported_props``.
    wikidata_props: List[str] = DEFAULT_PROPERTIES
    # Language code passed to the Wikibase client.
    lang: str = DEFAULT_LANG_CODE

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the python packages exist and build the API clients.

        Args:
            values: The field values of the model being validated.

        Returns:
            ``values`` with ``wikidata_mw`` and ``wikidata_rest`` set to
            freshly constructed clients.

        Raises:
            ImportError: If ``mediawikiapi`` or ``wikibase_rest_api_client``
                cannot be imported.
        """
        # Keep the try bodies limited to the imports so that only a missing
        # package is reported as "could not import".
        try:
            from mediawikiapi import MediaWikiAPI
            from mediawikiapi.config import Config
        except ImportError as e:
            raise ImportError(
                "Could not import mediawikiapi python package. "
                "Please install it with `pip install mediawikiapi`."
            ) from e

        values["wikidata_mw"] = MediaWikiAPI(
            Config(user_agent=WIKIDATA_USER_AGENT, mediawiki_url=WIKIDATA_API_URL)
        )

        try:
            from wikibase_rest_api_client import Client
        except ImportError as e:
            raise ImportError(
                "Could not import wikibase_rest_api_client python package. "
                "Please install it with `pip install wikibase-rest-api-client`."
            ) from e

        values["wikidata_rest"] = Client(
            timeout=60,
            base_url=WIKIDATA_REST_API_URL,
            headers={"User-Agent": WIKIDATA_USER_AGENT},
            follow_redirects=True,
        )
        return values

    def _item_to_document(self, qid: str) -> Optional[Document]:
        """Fetch the item ``qid`` and render it as a text ``Document``.

        Args:
            qid: Identifier of the item to fetch.

        Returns:
            A ``Document`` whose content lists the item's label, description,
            aliases and non-empty statements (one per line, truncated to
            ``doc_content_chars_max`` characters) and whose metadata holds the
            item's ``title`` and ``source`` URL; ``None`` if the client
            returns nothing for ``qid``.
        """
        from wikibase_rest_api_client.utilities.fluent import FluentWikibaseClient

        fluent_client: FluentWikibaseClient = FluentWikibaseClient(
            self.wikidata_rest, supported_props=self.wikidata_props, lang=self.lang
        )
        resp = fluent_client.get_item(qid)

        if not resp:
            logger.warning("Could not find item %s in Wikidata", qid)
            return None

        doc_lines = []
        if resp.label:
            doc_lines.append(f"Label: {resp.label}")
        if resp.description:
            doc_lines.append(f"Description: {resp.description}")
        if resp.aliases:
            doc_lines.append(f"Aliases: {', '.join(resp.aliases)}")
        for prop, values in resp.statements.items():
            if values:
                doc_lines.append(f"{prop.label}: {', '.join(values)}")

        return Document(
            page_content=("\n".join(doc_lines))[: self.doc_content_chars_max],
            # ``Document`` stores its metadata in the ``metadata`` field.
            metadata={"title": qid, "source": f"https://www.wikidata.org/wiki/{qid}"},
        )

    def _search(self, query: str) -> List[str]:
        """Search Wikidata and return at most ``top_k_results`` hits.

        The query is clipped to ``WIKIDATA_MAX_QUERY_LENGTH`` characters before
        it is sent. The hits are used by the callers as item identifiers.
        """
        clipped_query = query[:WIKIDATA_MAX_QUERY_LENGTH]
        items = self.wikidata_mw.search(clipped_query, results=self.top_k_results)
        return items[: self.top_k_results]

    def load(self, query: str) -> List[Document]:
        """Run Wikidata search and get the item documents plus the meta information.

        Args:
            query: Free-text search query.

        Returns:
            One ``Document`` per search hit that could be fetched; hits for
            which no item was found are skipped.
        """
        return [
            doc for item in self._search(query) if (doc := self._item_to_document(item))
        ]

    def run(self, query: str) -> str:
        """Run Wikidata search and get item summaries.

        Args:
            query: Free-text search query.

        Returns:
            The summaries of all fetched hits, each prefixed with
            ``Result <item>:`` and separated by blank lines, truncated to
            ``doc_content_chars_max`` characters; a fixed "not found" message
            if no hit produced a document.
        """
        docs = [
            f"Result {item}:\n{doc.page_content}"
            for item in self._search(query)
            if (doc := self._item_to_document(item))
        ]
        if not docs:
            return "No good Wikidata Search Result was found"
        return "\n\n".join(docs)[: self.doc_content_chars_max]
|