community: add flag to toggle progress bar (#24463)

- **Description:** Add a `show_progress` flag that determines whether a progress bar is shown while loading pages
- **Issue:** n/a
- **Dependencies:** n/a
- **Twitter handle:** n/a

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
clement.l 2024-07-20 21:18:02 +08:00 committed by GitHub
parent 6b08a33fa4
commit d98b830e4b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 23 additions and 6 deletions

View File

@ -41,6 +41,7 @@ class BlackboardLoader(WebBaseLoader):
basic_auth: Optional[Tuple[str, str]] = None,
cookies: Optional[dict] = None,
continue_on_failure: bool = False,
show_progress: bool = True,
):
"""Initialize with blackboard course url.
@ -56,12 +57,15 @@ class BlackboardLoader(WebBaseLoader):
occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but also
may result in missing data. Default: False
show_progress: whether to show a progress bar while loading. Default: True
Raises:
ValueError: If blackboard course url is invalid.
"""
super().__init__(
web_paths=(blackboard_course_url), continue_on_failure=continue_on_failure
web_paths=(blackboard_course_url),
continue_on_failure=continue_on_failure,
show_progress=show_progress,
)
# Get base url
try:

View File

@ -20,6 +20,7 @@ class GitbookLoader(WebBaseLoader):
base_url: Optional[str] = None,
content_selector: str = "main",
continue_on_failure: bool = False,
show_progress: bool = True,
):
"""Initialize with web page and whether to load all paths.
@ -36,6 +37,7 @@ class GitbookLoader(WebBaseLoader):
occurs loading a url, emitting a warning instead of raising an
exception. Setting this to True makes the loader more robust, but also
may result in missing data. Default: False
show_progress: whether to show a progress bar while loading. Default: True
"""
self.base_url = base_url or web_page
if self.base_url.endswith("/"):
@ -43,7 +45,11 @@ class GitbookLoader(WebBaseLoader):
if load_all_paths:
# set web_path to the sitemap if we want to crawl all paths
web_page = f"{self.base_url}/sitemap.xml"
super().__init__(web_paths=(web_page,), continue_on_failure=continue_on_failure)
super().__init__(
web_paths=(web_page,),
continue_on_failure=continue_on_failure,
show_progress=show_progress,
)
self.load_all_paths = load_all_paths
self.content_selector = content_selector

View File

@ -58,6 +58,8 @@ class WebBaseLoader(BaseLoader):
bs_get_text_kwargs: Optional[Dict[str, Any]] = None,
bs_kwargs: Optional[Dict[str, Any]] = None,
session: Any = None,
*,
show_progress: bool = True,
) -> None:
"""Initialize loader.
@ -69,6 +71,7 @@ class WebBaseLoader(BaseLoader):
raise_for_status: Raise an exception if http status code denotes an error.
bs_get_text_kwargs: kwargs for beautifulsoup4 get_text
bs_kwargs: kwargs for beautifulsoup4 web page parsing
show_progress: Show progress bar when loading pages.
"""
# web_path kept for backwards-compatibility.
if web_path and web_paths:
@ -91,6 +94,7 @@ class WebBaseLoader(BaseLoader):
self.default_parser = default_parser
self.requests_kwargs = requests_kwargs or {}
self.raise_for_status = raise_for_status
self.show_progress = show_progress
self.bs_get_text_kwargs = bs_get_text_kwargs or {}
self.bs_kwargs = bs_kwargs or {}
if session:
@ -177,11 +181,14 @@ class WebBaseLoader(BaseLoader):
task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
tasks.append(task)
try:
from tqdm.asyncio import tqdm_asyncio
if self.show_progress:
from tqdm.asyncio import tqdm_asyncio
return await tqdm_asyncio.gather(
*tasks, desc="Fetching pages", ascii=True, mininterval=1
)
return await tqdm_asyncio.gather(
*tasks, desc="Fetching pages", ascii=True, mininterval=1
)
else:
return await asyncio.gather(*tasks)
except ImportError:
warnings.warn("For better logging of progress, `pip install tqdm`")
return await asyncio.gather(*tasks)