fix recursive loader (#10856)

pull/10858/head
Bagatur 1 year ago committed by GitHub
parent de0a02f507
commit b05a74b106
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -120,6 +120,7 @@ class RecursiveUrlLoader(BaseLoader):
return
# Get all links that can be accessed from the current URL
+visited.add(url)
try:
response = requests.get(url, timeout=self.timeout, headers=self.headers)
except Exception:
@@ -131,7 +132,6 @@ class RecursiveUrlLoader(BaseLoader):
page_content=content,
metadata=self.metadata_extractor(response.text, url),
)
-visited.add(url)
# Store the visited links and recursively visit the children
sub_links = extract_sub_links(
@@ -184,11 +184,11 @@ class RecursiveUrlLoader(BaseLoader):
timeout=aiohttp.ClientTimeout(total=self.timeout),
headers=self.headers,
)
+async with self._lock: # type: ignore
+visited.add(url)
try:
async with session.get(url) as response:
text = await response.text()
-async with self._lock: # type: ignore
-visited.add(url)
except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
logger.warning(
f"Unable to load {url}. Received error {e} of type "

@@ -14,7 +14,7 @@ def test_async_recursive_url_loader() -> None:
timeout=None,
)
docs = loader.load()
-assert len(docs) == 1024
+assert len(docs) == 890
assert docs[0].page_content == "placeholder"
@@ -38,7 +38,7 @@ def test_sync_recursive_url_loader() -> None:
url, extractor=lambda _: "placeholder", use_async=False, max_depth=2
)
docs = loader.load()
-assert len(docs) == 27
+assert len(docs) == 25
assert docs[0].page_content == "placeholder"

Loading…
Cancel
Save