Harrison/headers (#1696)

Co-authored-by: Tim Asp <707699+timothyasp@users.noreply.github.com>
Author: Harrison Chase, 2023-03-15 13:13:21 -07:00 (committed by GitHub)
parent 3ea6d9c4d2
commit aad4bff098

@@ -1,24 +1,50 @@
 """Web base loader class."""
-from typing import Any, List
+import logging
+from typing import Any, List, Optional
 
 import requests
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
+logger = logging.getLogger(__file__)
+
+default_header_template = {
+    "User-Agent": "",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*"
+    ";q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Referer": "https://www.google.com/",
+    "DNT": "1",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+}
+
 
 class WebBaseLoader(BaseLoader):
     """Loader that uses urllib and beautiful soup to load webpages."""
 
-    def __init__(self, web_path: str):
+    def __init__(self, web_path: str, header_template: Optional[dict] = None):
         """Initialize with webpage path."""
         self.web_path = web_path
+        self.session = requests.Session()
+        try:
+            from fake_useragent import UserAgent
+
+            headers = header_template or default_header_template
+            headers["User-Agent"] = UserAgent().random
+            self.session.headers = dict(headers)
+        except ImportError:
+            logger.info(
+                "fake_useragent not found, using default user agent."
+                "To get a realistic header for requests, `pip install fake_useragent`."
+            )
 
-    @staticmethod
-    def _scrape(url: str) -> Any:
+    def _scrape(self, url: str) -> Any:
         from bs4 import BeautifulSoup
 
-        html_doc = requests.get(url)
+        html_doc = self.session.get(url)
         soup = BeautifulSoup(html_doc.text, "html.parser")
         return soup
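
For anyone trying the change locally, a minimal usage sketch of the new header_template parameter. The URL and header values are illustrative assumptions, not part of this commit, and load() lives outside this hunk (the Document and List imports above imply it):

    from langchain.document_loaders.web_base import WebBaseLoader

    # Custom headers to send with every request. If fake_useragent is
    # installed, __init__ overwrites "User-Agent" with a randomized value.
    headers = {
        "User-Agent": "my-crawler/0.1",
        "Accept-Language": "en-US,en;q=0.5",
    }

    loader = WebBaseLoader("https://example.com", header_template=headers)
    docs = loader.load()  # _scrape() now fetches through loader.session

Note that as committed, header_template only takes effect when fake_useragent is importable: the headers assignment sits inside the try block, so on ImportError the session falls back to requests' stock headers.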