mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add concurrency to GitbookLoader (#7069)
- Description: Fetch all pages concurrently.
- Dependencies: `scrape_all` -> `fetch_all` -> `_fetch_with_rate_limit` -> `_fetch` (might be broken currently: https://github.com/hwchase17/langchain/pull/6519)
- Tag maintainer: @rlancemartin, @eyurtsev

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
6aa66fd2b0
commit
930e319ca7
@@ -49,17 +49,18 @@ class GitbookLoader(WebBaseLoader):
|
||||
if self.load_all_paths:
|
||||
soup_info = self.scrape()
|
||||
relative_paths = self._get_paths(soup_info)
|
||||
documents = []
|
||||
for path in relative_paths:
|
||||
url = urljoin(self.base_url, path)
|
||||
print(f"Fetching text from {url}")
|
||||
soup_info = self._scrape(url)
|
||||
documents.append(self._get_document(soup_info, url))
|
||||
return [d for d in documents if d]
|
||||
urls = [urljoin(self.base_url, path) for path in relative_paths]
|
||||
soup_infos = self.scrape_all(urls)
|
||||
_documents = [
|
||||
self._get_document(soup_info, url)
|
||||
for soup_info, url in zip(soup_infos, urls)
|
||||
]
|
||||
else:
|
||||
soup_info = self.scrape()
|
||||
documents = [self._get_document(soup_info, self.web_path)]
|
||||
return [d for d in documents if d]
|
||||
_documents = [self._get_document(soup_info, self.web_path)]
|
||||
documents = [d for d in _documents if d]
|
||||
|
||||
return documents
|
||||
|
||||
def _get_document(
|
||||
self, soup: Any, custom_url: Optional[str] = None
|
||||
|
Loading…
Reference in New Issue
Block a user