adds doc_content_chars_max argument to WikipediaLoader (#6645)

# Description
It adds a new initialization param in `WikipediaLoader` so we can
override the `doc_content_chars_max` param used in `WikipediaAPIWrapper`
under the hood, e.g:

```python
from langchain.document_loaders import WikipediaLoader

# doc_content_chars_max is the new init param
loader = WikipediaLoader(query="python", doc_content_chars_max=90000)
```

## Decisions
`doc_content_chars_max` default value will be 4000, because it's the
current value
I have added pycode comments

# Issue
#6639

# Dependencies
None


# Twitter handle
[@elafo](https://twitter.com/elafo)
This commit is contained in:
eLafo 2023-06-24 00:22:09 +02:00 committed by GitHub
parent 5e5b30b74f
commit db8b13df4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -18,17 +18,36 @@ class WikipediaLoader(BaseLoader):
lang: str = "en",
load_max_docs: Optional[int] = 100,
load_all_available_meta: Optional[bool] = False,
doc_content_chars_max: Optional[int] = 4000,
):
"""
Initializes a new instance of the WikipediaLoader class.
Args:
query (str): The query string to search on Wikipedia.
lang (str, optional): The language code for the Wikipedia language edition. Defaults to "en".
load_max_docs (int, optional): The maximum number of documents to load. Defaults to 100.
load_all_available_meta (bool, optional): Indicates whether to load all available metadata for each document. Defaults to False.
doc_content_chars_max (int, optional): The maximum number of characters for the document content. Defaults to 4000.
"""
self.query = query
self.lang = lang
self.load_max_docs = load_max_docs
self.load_all_available_meta = load_all_available_meta
self.doc_content_chars_max = doc_content_chars_max
def load(self) -> List[Document]:
"""
Loads the query result from Wikipedia into a list of Documents.
Returns:
List[Document]: A list of Document objects representing the loaded Wikipedia pages.
"""
client = WikipediaAPIWrapper(
lang=self.lang,
top_k_results=self.load_max_docs,
load_all_available_meta=self.load_all_available_meta,
doc_content_chars_max=self.doc_content_chars_max,
)
docs = client.load(self.query)
return docs