2023-05-24 17:40:27 +00:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2023-12-11 21:53:30 +00:00
|
|
|
from langchain_community.document_loaders.readthedocs import ReadTheDocsLoader
|
2023-05-24 17:40:27 +00:00
|
|
|
|
|
|
|
# Root directory of the ReadTheDocs fixtures, resolved relative to this test
# module so the tests do not depend on the current working directory.
PARENT_DIR = Path(__file__).parent.joinpath("test_docs", "readthedocs")
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.requires("bs4")
def test_main_id_main_content() -> None:
    """Load the ``main_id_main_content`` fixture with default settings.

    The first loaded document must carry non-empty page content.
    """
    fixture_dir = PARENT_DIR / "main_id_main_content"
    docs = ReadTheDocsLoader(fixture_dir).load()
    first_doc = docs[0]
    assert len(first_doc.page_content) > 0
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.requires("bs4")
def test_div_role_main() -> None:
    """Load the ``div_role_main`` fixture with default settings.

    The first loaded document must carry non-empty page content.
    """
    fixture_dir = PARENT_DIR / "div_role_main"
    docs = ReadTheDocsLoader(fixture_dir).load()
    first_doc = docs[0]
    assert len(first_doc.page_content) > 0
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.requires("bs4")
def test_custom() -> None:
    """Load the ``custom`` fixture using an explicit ``custom_html_tag``.

    With the ``("article", {"role": "main"})`` selector supplied, the first
    loaded document must carry non-empty page content.
    """
    main_article_tag = ("article", {"role": "main"})
    docs = ReadTheDocsLoader(
        PARENT_DIR / "custom", custom_html_tag=main_article_tag
    ).load()
    first_doc = docs[0]
    assert len(first_doc.page_content) > 0
|
|
|
|
|
|
|
|
|
More comprehensive readthedocs document loader (#12382)
## **Description:**
When building our own readthedocs.io scraper, we noticed a couple
interesting things:
1. Text lines with a lot of nested <span> tags would give unclean text
with a bunch of newlines. For example, for [Langchain's
documentation](https://api.python.langchain.com/en/latest/document_loaders/langchain.document_loaders.readthedocs.ReadTheDocsLoader.html#langchain.document_loaders.readthedocs.ReadTheDocsLoader),
a single line is represented in a complicated nested HTML structure, and
the naive `soup.get_text()` call currently being made will create a
newline for each nested HTML element. Therefore, the document loader
would give a messy, newline-separated blob of text. This would be true
in a lot of cases.
<img width="945" alt="Screenshot 2023-10-26 at 6 15 39 PM"
src="https://github.com/langchain-ai/langchain/assets/44193474/eca85d1f-d2bf-4487-a18a-e1e732fadf19">
<img width="1031" alt="Screenshot 2023-10-26 at 6 16 00 PM"
src="https://github.com/langchain-ai/langchain/assets/44193474/035938a0-9892-4f6a-83cd-0d7b409b00a3">
Additionally, content from iframes, code from scripts, CSS from styles,
etc. will be scraped if it is a descendant of the selected element (which happens
more often than you'd think). For example, [this
page](https://pydeck.gl/gallery/contour_layer.html#) will scrape 1.5
million characters of content that looks like this:
<img width="1372" alt="Screenshot 2023-10-26 at 6 32 55 PM"
src="https://github.com/langchain-ai/langchain/assets/44193474/dbd89e39-9478-4a18-9e84-f0eb91954eac">
Therefore, I wrote a recursive _get_clean_text(soup) class function that
1. skips all irrelevant elements, and 2. only adds newlines when
necessary.
2. Index pages (like [this
one](https://api.python.langchain.com/en/latest/api_reference.html))
would be loaded, chunked, and eventually embedded. This is really bad
not just because the user will be embedding irrelevant information - but
because index pages are very likely to show up in retrieved content,
making retrieval less effective (in our tests). Therefore, I added a
bool parameter `exclude_index_pages` defaulted to False (which is the
current behavior — although I'd petition to default this to True) that
will skip all pages where links take up 50%+ of the page. Through manual
testing, this seems to be the best threshold.
## Other Information:
- **Issue:** n/a
- **Dependencies:** n/a
- **Tag maintainer:** n/a
- **Twitter handle:** @andrewthezhou
---------
Co-authored-by: Andrew Zhou <andrew@heykona.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-10-29 23:26:53 +00:00
|
|
|
@pytest.mark.requires("bs4")
def test_nested_html_structure() -> None:
    """Load the ``nested_html_structure`` fixture and check the exact text.

    The first document's content must be exactly ``"Hello World!"``, i.e. the
    extracted text contains no stray separators or newlines.
    """
    expected_text = "Hello World!"
    fixture_dir = PARENT_DIR / "nested_html_structure"
    docs = ReadTheDocsLoader(fixture_dir).load()
    first_doc = docs[0]
    assert first_doc.page_content == expected_text
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.requires("bs4")
def test_index_page() -> None:
    """Load the ``index_page`` fixture with ``exclude_links_ratio=0.5``.

    The first loaded document must have empty page content.
    NOTE(review): presumably the fixture is link-heavy enough to exceed the
    0.5 ratio and is blanked for that reason; confirm against the fixture.
    """
    fixture_dir = PARENT_DIR / "index_page"
    docs = ReadTheDocsLoader(fixture_dir, exclude_links_ratio=0.5).load()
    content_length = len(docs[0].page_content)
    assert content_length == 0
|
|
|
|
|
|
|
|
|
2023-05-24 17:40:27 +00:00
|
|
|
@pytest.mark.requires("bs4")
def test_empty() -> None:
    """Load the ``custom`` fixture without supplying ``custom_html_tag``.

    With only the default settings, the first loaded document must have empty
    page content (contrast with ``test_custom``, which passes a selector).
    """
    fixture_dir = PARENT_DIR / "custom"
    docs = ReadTheDocsLoader(fixture_dir).load()
    content_length = len(docs[0].page_content)
    assert content_length == 0
|