mirror of https://github.com/hwchase17/langchain
Added new use case docs for Web Scraping, Chromium loader, BS4 transformer (#8732)
- Description: Added a new use case category called "Web Scraping" to the docs, along with a tutorial on scraping websites using the OpenAI Functions extraction chain.
- Tag maintainer: @baskaryan @hwchase17
- Twitter handle: https://www.linkedin.com/in/haiphunghiem/ (I'm on LinkedIn mostly)

---------

Co-authored-by: Lance Martin <lance@langchain.dev>
parent
6cb763507c
commit
e4418d1b7e
@@ -0,0 +1,9 @@
---
sidebar_position: 3
---

# Web Scraping

Web scraping has historically been a challenging endeavor due to the ever-changing nature of website structures, which makes scraping scripts tedious for developers to maintain. Traditional methods often rely on specific HTML tags and patterns which, when altered, can break the data extraction process.

Enter the LLM-based method for parsing HTML: by leveraging the capabilities of LLMs, and especially OpenAI Functions in LangChain's extraction chain, developers can instruct the model to extract only the desired data in a specified format. This method not only streamlines the extraction process but also significantly reduces the time spent on manual debugging and script modifications. Its adaptability means that even if a website undergoes significant design changes, extraction remains consistent and robust. This resilience translates into reduced maintenance effort, cost savings, and higher-quality extracted data. Compared to its predecessors, the LLM-based approach wins out in the web scraping domain by transforming a historically cumbersome task into a more automated and efficient process.
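As a rough end-to-end sketch of the workflow this page introduces, the snippet below chains the new loader and transformer from this PR into `create_extraction_chain`; the URL, schema, and model name are illustrative assumptions, not part of the committed docs:

from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Render the page with headless Chromium, then strip the HTML down to text.
loader = AsyncChromiumLoader(["https://example.com"])  # illustrative URL
docs = BeautifulSoupTransformer().transform_documents(loader.load())

# Illustrative schema: ask the model (via OpenAI Functions) for titles only.
schema = {
    "properties": {"title": {"type": "string"}},
    "required": ["title"],
}
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
chain = create_extraction_chain(schema, llm)
extracted = chain.run(docs[0].page_content)

For long pages, the cleaned content may still exceed the model's context window, so splitting it with a text splitter before extraction is usually necessary.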
Binary file not shown. (After: 152 KiB)
Binary file not shown. (After: 172 KiB)
Binary file not shown. (After: 716 KiB)
@@ -0,0 +1,90 @@
import asyncio
import logging
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML content from provided URLs using a
    headless instance of the Chromium browser."""

    def __init__(
        self,
        urls: List[str],
    ):
        """
        Initialize the loader with a list of URL paths.

        Args:
            urls (List[str]): A list of URLs to scrape content from.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls

        try:
            import playwright  # noqa: F401
        except ImportError:
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            )

    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.

        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.

        """
        for url in self.urls:
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    def load(self) -> List[Document]:
        """
        Load and return all Documents from the provided URLs.

        Returns:
            List[Document]: A list of Document objects
            containing the scraped content from each URL.

        """
        return list(self.lazy_load())
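A minimal usage sketch for the loader above, assuming Playwright's browser binaries are installed (`pip install playwright && playwright install chromium`); the URL is illustrative:

from langchain.document_loaders import AsyncChromiumLoader

loader = AsyncChromiumLoader(["https://example.com"])  # illustrative URL
docs = loader.load()  # one Document per URL, raw HTML in page_content
print(docs[0].metadata["source"], len(docs[0].page_content))

Note that `lazy_load` drives each scrape with `asyncio.run`, so calling it from inside an already-running event loop (e.g. a Jupyter notebook) will raise a RuntimeError.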
@@ -0,0 +1,143 @@
from typing import Any, List, Sequence

from langchain.schema import BaseDocumentTransformer, Document


class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python
            from langchain.document_transformers import BeautifulSoupTransformer
            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """

    def __init__(self) -> None:
        """
        Initialize the transformer.

        This checks if the BeautifulSoup4 package is installed.
        If not, it raises an ImportError.
        """
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            )

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: List[str] = ["script", "style"],
        tags_to_extract: List[str] = ["p", "li", "div", "a"],
        remove_lines: bool = True,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """
        Transform a list of Document objects by cleaning their HTML content.

        Args:
            documents: A sequence of Document objects containing HTML content.
            unwanted_tags: A list of tags to be removed from the HTML.
            tags_to_extract: A list of tags whose content will be extracted.
            remove_lines: If set to True, unnecessary lines will be
                removed from the HTML content.

        Returns:
            A sequence of Document objects with transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            doc.page_content = cleaned_content

        return documents

    @staticmethod
    def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
        """
        Remove unwanted tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: A list of tags to be removed from the HTML.

        Returns:
            A cleaned HTML string with unwanted tags removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        return str(soup)

    @staticmethod
    def extract_tags(html_content: str, tags: List[str]) -> str:
        """
        Extract specific tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: A list of tags to be extracted from the HTML.

        Returns:
            A string combining the content of the extracted tags.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts = []
        for tag in tags:
            elements = soup.find_all(tag)
            for element in elements:
                if tag == "a":
                    href = element.get("href")
                    if href:
                        text_parts.append(f"{element.get_text()} ({href})")
                    else:
                        text_parts.append(element.get_text())
                else:
                    text_parts.append(element.get_text())
        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """
        Clean up the content by removing unnecessary lines.

        Args:
            content: A string, which may contain unnecessary lines or spaces.

        Returns:
            A cleaned string with unnecessary lines removed.
        """
        lines = content.split("\n")
        stripped_lines = [line.strip() for line in lines]
        non_empty_lines = [line for line in stripped_lines if line]
        seen = set()
        deduped_lines = []
        for line in non_empty_lines:
            if line not in seen:
                seen.add(line)
                deduped_lines.append(line)
        cleaned_content = " ".join(deduped_lines)
        return cleaned_content

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        raise NotImplementedError
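A quick sketch of the transformer on a made-up HTML snippet, tracing what the pipeline above produces:

from langchain.docstore.document import Document
from langchain.document_transformers import BeautifulSoupTransformer

html = (
    "<html><body><script>track()</script>"
    "<p>Hello</p><a href='/about'>About</a></body></html>"
)
doc = Document(page_content=html)

# <script> is removed by default; only <p> and <a> text is kept here.
transformed = BeautifulSoupTransformer().transform_documents(
    [doc], tags_to_extract=["p", "a"]
)
print(transformed[0].page_content)  # -> Hello About (/about)

Note that transform_documents mutates the input Documents in place and returns the same sequence.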