Add concurrency to GitbookLoader (#7069)

- Description: Fetch all GitBook pages concurrently instead of scraping them one at a time.
- Dependencies: `scrape_all` -> `fetch_all` -> `_fetch_with_rate_limit` -> `_fetch` (the `_fetch` step may currently be broken; see https://github.com/hwchase17/langchain/pull/6519). A simplified sketch of this call chain follows this list.
- Tag maintainer: @rlancemartin, @eyurtsev
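
The sketch below illustrates how that chain fits together; it is not the exact `WebBaseLoader` source (the `ConcurrentFetcher` class name and the `requests_per_second` default are assumptions). `fetch_all` fans out one coroutine per URL, and an `asyncio.Semaphore` caps how many `_fetch` calls are in flight at once.

```python
import asyncio
from typing import List

import aiohttp
from bs4 import BeautifulSoup


class ConcurrentFetcher:
    """Hypothetical stand-in for the concurrency helpers on WebBaseLoader."""

    requests_per_second: int = 2  # assumed throttle; the real default may differ

    async def _fetch(self, url: str) -> str:
        # One HTTP GET per URL; each call opens its own session for simplicity.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                return await response.text()

    async def _fetch_with_rate_limit(
        self, url: str, semaphore: asyncio.Semaphore
    ) -> str:
        # The semaphore bounds the number of concurrent requests.
        async with semaphore:
            return await self._fetch(url)

    async def fetch_all(self, urls: List[str]) -> List[str]:
        # Fan out one task per URL; gather() preserves input order.
        semaphore = asyncio.Semaphore(self.requests_per_second)
        tasks = [self._fetch_with_rate_limit(url, semaphore) for url in urls]
        return await asyncio.gather(*tasks)

    def scrape_all(self, urls: List[str]) -> List[BeautifulSoup]:
        # Synchronous entry point: run the event loop, then parse each page.
        results = asyncio.run(self.fetch_all(urls))
        return [BeautifulSoup(html, "html.parser") for html in results]
```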

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
commit 930e319ca7 (parent 6aa66fd2b0)
Author: Yevgnen
Date: 2023-07-06 08:51:10 +08:00 (committed by GitHub)

@@ -49,17 +49,18 @@ class GitbookLoader(WebBaseLoader):
         if self.load_all_paths:
             soup_info = self.scrape()
             relative_paths = self._get_paths(soup_info)
-            documents = []
-            for path in relative_paths:
-                url = urljoin(self.base_url, path)
-                print(f"Fetching text from {url}")
-                soup_info = self._scrape(url)
-                documents.append(self._get_document(soup_info, url))
-            return [d for d in documents if d]
+            urls = [urljoin(self.base_url, path) for path in relative_paths]
+            soup_infos = self.scrape_all(urls)
+            _documents = [
+                self._get_document(soup_info, url)
+                for soup_info, url in zip(soup_infos, urls)
+            ]
         else:
             soup_info = self.scrape()
-            documents = [self._get_document(soup_info, self.web_path)]
-            return [d for d in documents if d]
+            _documents = [self._get_document(soup_info, self.web_path)]
+
+        documents = [d for d in _documents if d]
+        return documents
 
     def _get_document(
         self, soup: Any, custom_url: Optional[str] = None
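
For context, a minimal usage sketch of the loader after this change (the docs URL is illustrative):

```python
from langchain.document_loaders import GitbookLoader

# With load_all_paths=True, load() gathers the relative paths once and
# fetches all of them concurrently through scrape_all(), rather than
# scraping each URL sequentially in a loop.
loader = GitbookLoader("https://docs.gitbook.com", load_all_paths=True)
docs = loader.load()
```

The refactor also hoists the `[d for d in _documents if d]` filtering out of both branches, so empty pages are dropped in one place regardless of which branch produced the documents.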