From 53e86205adf3d42105575c4f7561bbb551db6ad9 Mon Sep 17 00:00:00 2001 From: Siddhant Rai Date: Fri, 3 May 2024 18:47:30 +0530 Subject: [PATCH] fix: add more default request headers --- application/parser/remote/web_loader.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py index 98564a0..a19e0c9 100644 --- a/application/parser/remote/web_loader.py +++ b/application/parser/remote/web_loader.py @@ -1,6 +1,17 @@ from application.parser.remote.base import BaseRemote from langchain_community.document_loaders import WebBaseLoader +headers = { + "User-Agent": "Mozilla/5.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*" + ";q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Referer": "https://www.google.com/", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", +} + class WebLoader(BaseRemote): def __init__(self): @@ -13,9 +24,7 @@ class WebLoader(BaseRemote): documents = [] for url in urls: try: - loader = self.loader( - [url], header_template={"User-Agent": "Mozilla/5.0"} - ) + loader = self.loader([url], header_template=headers) documents.extend(loader.load()) except Exception as e: print(f"Error processing URL {url}: {e}")