From 8f2ad38503603a5da686bedc94b79193d91e7764 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 11 Oct 2023 10:13:51 +0100 Subject: [PATCH 01/17] tests --- application/parser/remote/base.py | 19 +++++++++++++++++++ application/parser/remote/telegram.py | 11 +++++++++++ 2 files changed, 30 insertions(+) create mode 100644 application/parser/remote/base.py create mode 100644 application/parser/remote/telegram.py diff --git a/application/parser/remote/base.py b/application/parser/remote/base.py new file mode 100644 index 00000000..91313f22 --- /dev/null +++ b/application/parser/remote/base.py @@ -0,0 +1,19 @@ +"""Base reader class.""" +from abc import abstractmethod +from typing import Any, List + +from langchain.docstore.document import Document as LCDocument +from application.parser.schema.base import Document + + +class BaseRemote: + """Utilities for loading data from a directory.""" + + @abstractmethod + def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: + """Load data from the input directory.""" + + def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: + """Load data in LangChain document format.""" + docs = self.load_data(**load_kwargs) + return [d.to_langchain_format() for d in docs] diff --git a/application/parser/remote/telegram.py b/application/parser/remote/telegram.py new file mode 100644 index 00000000..895d5cb3 --- /dev/null +++ b/application/parser/remote/telegram.py @@ -0,0 +1,11 @@ +from langchain.document_loader import TelegramChatApiLoader, TelegramChatFileLoader +from application.parser.remote.base import BaseRemote + +class TelegramChatApiRemote(BaseRemote): + def _init_parser(self, *args, **load_kwargs): + self.loader = TelegramChatApiLoader(**load_kwargs) + return {} + + def parse_file(self, *args, **load_kwargs): + + return text \ No newline at end of file From 658867cb46e253b8ae8b128f81ea5e50d999d613 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 01:03:40 +0400 Subject: [PATCH 02/17] No crawler, no sitemap --- application/parser/remote/base.py | 2 +- application/parser/remote/crawler_loader.py | 0 application/parser/remote/github_loader.py | 0 application/parser/remote/remote_creator.py | 18 ++++++++ application/parser/remote/sitemap_loader.py | 0 application/parser/remote/web_loader.py | 10 +++++ application/worker.py | 47 +++++++++++++++++++++ 7 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 application/parser/remote/crawler_loader.py create mode 100644 application/parser/remote/github_loader.py create mode 100644 application/parser/remote/remote_creator.py create mode 100644 application/parser/remote/sitemap_loader.py create mode 100644 application/parser/remote/web_loader.py diff --git a/application/parser/remote/base.py b/application/parser/remote/base.py index 91313f22..75ae34d5 100644 --- a/application/parser/remote/base.py +++ b/application/parser/remote/base.py @@ -1,6 +1,6 @@ """Base reader class.""" from abc import abstractmethod -from typing import Any, List +from typing import Any, List, Iterator from langchain.docstore.document import Document as LCDocument from application.parser.schema.base import Document diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/remote_creator.py 
b/application/parser/remote/remote_creator.py new file mode 100644 index 00000000..e12b7a02 --- /dev/null +++ b/application/parser/remote/remote_creator.py @@ -0,0 +1,18 @@ +# from sitemap_loader import SitemapLoader +# from crawler_loader import CrawlerLoader +from application.parser.remote.web_loader import WebLoader + + +class RemoteCreator: + loaders = { + 'url': WebLoader, + # 'sitemap': SitemapLoader, + # 'crawler': CrawlerLoader + } + + @classmethod + def create_loader(cls, type, *args, **kwargs): + loader_class = cls.loaders.get(type.lower()) + if not loader_class: + raise ValueError(f"No LLM class found for type {type}") + return loader_class(*args, **kwargs) \ No newline at end of file diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py new file mode 100644 index 00000000..ad2847e2 --- /dev/null +++ b/application/parser/remote/web_loader.py @@ -0,0 +1,10 @@ +from application.parser.remote.base import BaseRemote + +class WebLoader(BaseRemote): + def __init__(self): + from langchain.document_loaders import WebBaseLoader + self.loader = WebBaseLoader + + def load_data(self, urls): + loader = self.loader(urls) + return loader.load() \ No newline at end of file diff --git a/application/worker.py b/application/worker.py index 71fcd615..fe4e2615 100644 --- a/application/worker.py +++ b/application/worker.py @@ -9,6 +9,7 @@ import requests from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.remote.remote_creator import RemoteCreator from application.parser.open_ai_func import call_openai_api from application.parser.schema.base import Document from application.parser.token_func import group_split @@ -104,3 +105,49 @@ def ingest_worker(self, directory, formats, name_job, filename, user): 'user': user, 'limited': False } + +def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url'): + sample = False + token_check = True + min_tokens = 150 + max_tokens = 1250 + full_path = directory + '/' + user + '/' + name_job + + if not os.path.exists(full_path): + os.makedirs(full_path) + + self.update_state(state='PROGRESS', meta={'current': 1}) + + # Use RemoteCreator to load data from URL + remote_loader = RemoteCreator.create_loader(loader, urls) + raw_docs = remote_loader.load_data() + + raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + + call_openai_api(docs, full_path, self) + self.update_state(state='PROGRESS', meta={'current': 100}) + + if sample: + for i in range(min(5, len(raw_docs))): + print(raw_docs[i].text) + + # Proceed with uploading and cleaning as in the original function + file_data = {'name': name_job, 'user': user} + if settings.VECTOR_STORE == "faiss": + files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), + 'file_pkl': open(full_path + '/index.pkl', 'rb')} + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) + else: + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) + + shutil.rmtree(full_path) + + return { + 'urls': urls, + 'name_job': name_job, + 'user': user, 
+ 'limited': False + } \ No newline at end of file From c517bdd2e162d4a17c31143b2c352a7055b91a36 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 12:35:26 +0400 Subject: [PATCH 03/17] Crawler + sitemap --- application/parser/remote/crawler_loader.py | 36 +++++++++++++++++++++ application/parser/remote/sitemap_loader.py | 27 ++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index e69de29b..208195cb 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -0,0 +1,36 @@ +import requests +from urllib.parse import urlparse, urljoin +from bs4 import BeautifulSoup +from application.parser.remote.base import BaseRemote + +class CrawlerLoader(BaseRemote): + def __init__(self): + from langchain.document_loaders import WebBaseLoader + self.loader = WebBaseLoader + + def load_data(self, url): + # Fetch the content of the initial URL + response = requests.get(url) + if response.status_code != 200: + print(f"Failed to fetch initial URL: {url}") + return None + + # Parse the HTML content + soup = BeautifulSoup(response.text, 'html.parser') + + # Extract the base URL to ensure we only fetch URLs from the same domain + base_url = urlparse(url).scheme + "://" + urlparse(url).hostname + + # Extract all links from the HTML content + all_links = [a['href'] for a in soup.find_all('a', href=True)] + + # Filter out the links that lead to a different domain + same_domain_links = [urljoin(base_url, link) for link in all_links if base_url in urljoin(base_url, link)] + + # Remove duplicates + same_domain_links = list(set(same_domain_links)) + + #TODO: Optimize this section to parse pages as they are being crawled + loaded_content = self.loader(same_domain_links).load() + + return loaded_content diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index e69de29b..366d81ed 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -0,0 +1,27 @@ +import requests +import xml.etree.ElementTree as ET +from application.parser.remote.base import BaseRemote + +class SitemapLoader(BaseRemote): + def __init__(self): + from langchain.document_loaders import WebBaseLoader + self.loader = WebBaseLoader + + def load_data(self, sitemap_url): + # Fetch the sitemap content + response = requests.get(sitemap_url) + if response.status_code != 200: + print(f"Failed to fetch sitemap: {sitemap_url}") + return None + + # Parse the sitemap XML + root = ET.fromstring(response.content) + + # Extract URLs from the sitemap + # The namespace with "loc" tag might be needed to extract URLs + ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'} + urls = [loc.text for loc in root.findall('s:url/s:loc', ns)] + + # Use your existing loader to load content of extracted URLs + loader = self.loader(urls) + return loader.load() \ No newline at end of file From 50f07f9ef5f64292ba6e9f6e7b9cc2b2bafc60c0 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 12:53:33 +0400 Subject: [PATCH 04/17] limit crawler --- application/parser/remote/crawler_loader.py | 49 ++++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index 208195cb..9acb8f39 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -4,33 +4,48 @@ from 
bs4 import BeautifulSoup from application.parser.remote.base import BaseRemote class CrawlerLoader(BaseRemote): - def __init__(self): + def __init__(self, limit=10): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader + self.limit = limit def load_data(self, url): - # Fetch the content of the initial URL - response = requests.get(url) - if response.status_code != 200: - print(f"Failed to fetch initial URL: {url}") - return None - - # Parse the HTML content - soup = BeautifulSoup(response.text, 'html.parser') + # Create a set to store visited URLs to avoid revisiting the same page + visited_urls = set() # Extract the base URL to ensure we only fetch URLs from the same domain base_url = urlparse(url).scheme + "://" + urlparse(url).hostname - # Extract all links from the HTML content - all_links = [a['href'] for a in soup.find_all('a', href=True)] + # Initialize a list with the initial URL + urls_to_visit = [url] - # Filter out the links that lead to a different domain - same_domain_links = [urljoin(base_url, link) for link in all_links if base_url in urljoin(base_url, link)] + while urls_to_visit: + current_url = urls_to_visit.pop(0) + visited_urls.add(current_url) - # Remove duplicates - same_domain_links = list(set(same_domain_links)) + # Fetch the content of the current URL + response = requests.get(current_url) + if response.status_code != 200: + print(f"Failed to fetch URL: {current_url}") + continue + + # Parse the HTML content + soup = BeautifulSoup(response.text, 'html.parser') + + # Extract all links from the HTML content + all_links = [urljoin(current_url, a['href']) for a in soup.find_all('a', href=True) if base_url in urljoin(current_url, a['href'])] + + # Add the new links to the urls_to_visit list if they haven't been visited yet + urls_to_visit.extend([link for link in all_links if link not in visited_urls]) + + # Remove duplicates + urls_to_visit = list(set(urls_to_visit)) + + # Stop if the limit is reached + if self.limit is not None and len(visited_urls) >= self.limit: + break #TODO: Optimize this section to parse pages as they are being crawled - loaded_content = self.loader(same_domain_links).load() + loaded_content = self.loader(list(visited_urls)).load() - return loaded_content + return loaded_content \ No newline at end of file From 2cfb416fd053a5e32c508ee31aead6b055b3aedf Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 13:44:32 +0400 Subject: [PATCH 05/17] Desc loader --- application/parser/remote/crawler_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index 9acb8f39..2364dc27 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -7,6 +7,7 @@ class CrawlerLoader(BaseRemote): def __init__(self, limit=10): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader + #No pages scraped limit, set None for no limit self.limit = limit def load_data(self, url): From 719ca63ec126e270334b359a54c217e995dcca31 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 19:40:23 +0400 Subject: [PATCH 06/17] fixes --- application/parser/remote/crawler_loader.py | 62 ++++++++++-------- application/parser/remote/remote_creator.py | 8 +-- application/parser/remote/sitemap_loader.py | 71 +++++++++++++++++---- application/parser/remote/web_loader.py | 14 +++- 4 files changed, 109 insertions(+), 46 deletions(-) diff --git a/application/parser/remote/crawler_loader.py 
b/application/parser/remote/crawler_loader.py index 2364dc27..380a25bf 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -6,47 +6,53 @@ from application.parser.remote.base import BaseRemote class CrawlerLoader(BaseRemote): def __init__(self, limit=10): from langchain.document_loaders import WebBaseLoader - self.loader = WebBaseLoader - #No pages scraped limit, set None for no limit - self.limit = limit + self.loader = WebBaseLoader # Initialize the document loader + self.limit = limit # Set the limit for the number of pages to scrape def load_data(self, url): - # Create a set to store visited URLs to avoid revisiting the same page - visited_urls = set() + # Check if the input is a list and if it is, use the first element + if isinstance(url, list) and url: + url = url[0] - # Extract the base URL to ensure we only fetch URLs from the same domain - base_url = urlparse(url).scheme + "://" + urlparse(url).hostname + # Check if the URL scheme is provided, if not, assume http + if not urlparse(url).scheme: + url = "http://" + url - # Initialize a list with the initial URL - urls_to_visit = [url] + visited_urls = set() # Keep track of URLs that have been visited + base_url = urlparse(url).scheme + "://" + urlparse(url).hostname # Extract the base URL + urls_to_visit = [url] # List of URLs to be visited, starting with the initial URL + loaded_content = [] # Store the loaded content from each URL + # Continue crawling until there are no more URLs to visit while urls_to_visit: - current_url = urls_to_visit.pop(0) - visited_urls.add(current_url) + current_url = urls_to_visit.pop(0) # Get the next URL to visit + visited_urls.add(current_url) # Mark the URL as visited - # Fetch the content of the current URL - response = requests.get(current_url) - if response.status_code != 200: - print(f"Failed to fetch URL: {current_url}") + # Try to load and process the content from the current URL + try: + response = requests.get(current_url) # Fetch the content of the current URL + response.raise_for_status() # Raise an exception for HTTP errors + loader = self.loader([current_url]) # Initialize the document loader for the current URL + loaded_content.extend(loader.load()) # Load the content and add it to the loaded_content list + except Exception as e: + # Print an error message if loading or processing fails and continue with the next URL + print(f"Error processing URL {current_url}: {e}") continue - # Parse the HTML content + # Parse the HTML content to extract all links soup = BeautifulSoup(response.text, 'html.parser') + all_links = [ + urljoin(current_url, a['href']) + for a in soup.find_all('a', href=True) + if base_url in urljoin(current_url, a['href']) # Ensure links are from the same domain + ] - # Extract all links from the HTML content - all_links = [urljoin(current_url, a['href']) for a in soup.find_all('a', href=True) if base_url in urljoin(current_url, a['href'])] - - # Add the new links to the urls_to_visit list if they haven't been visited yet + # Add new links to the list of URLs to visit if they haven't been visited yet urls_to_visit.extend([link for link in all_links if link not in visited_urls]) + urls_to_visit = list(set(urls_to_visit)) # Remove duplicate URLs - # Remove duplicates - urls_to_visit = list(set(urls_to_visit)) - - # Stop if the limit is reached + # Stop crawling if the limit of pages to scrape is reached if self.limit is not None and len(visited_urls) >= self.limit: break - #TODO: Optimize this section to parse pages 
as they are being crawled - loaded_content = self.loader(list(visited_urls)).load() - - return loaded_content \ No newline at end of file + return loaded_content # Return the loaded content from all visited URLs diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index e12b7a02..e45333d4 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -1,13 +1,13 @@ -# from sitemap_loader import SitemapLoader -# from crawler_loader import CrawlerLoader +from application.parser.remote.sitemap_loader import SitemapLoader +from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader class RemoteCreator: loaders = { 'url': WebLoader, - # 'sitemap': SitemapLoader, - # 'crawler': CrawlerLoader + 'sitemap': SitemapLoader, + 'crawler': CrawlerLoader } @classmethod diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index 366d81ed..0a3f4d4c 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -1,27 +1,74 @@ import requests +import re # Import regular expression library import xml.etree.ElementTree as ET from application.parser.remote.base import BaseRemote class SitemapLoader(BaseRemote): - def __init__(self): + def __init__(self, limit=20): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader + self.limit = limit # Adding limit to control the number of URLs to process def load_data(self, sitemap_url): - # Fetch the sitemap content + urls = self._extract_urls(sitemap_url) + if not urls: + print(f"No URLs found in the sitemap: {sitemap_url}") + return [] + + # Load content of extracted URLs + documents = [] + processed_urls = 0 # Counter for processed URLs + for url in urls: + if self.limit is not None and processed_urls >= self.limit: + break # Stop processing if the limit is reached + + try: + loader = self.loader([url]) + documents.extend(loader.load()) + processed_urls += 1 # Increment the counter after processing each URL + except Exception as e: + print(f"Error processing URL {url}: {e}") + continue + + return documents + + def _extract_urls(self, sitemap_url): response = requests.get(sitemap_url) if response.status_code != 200: print(f"Failed to fetch sitemap: {sitemap_url}") - return None + return [] - # Parse the sitemap XML - root = ET.fromstring(response.content) + # Determine if this is a sitemap or a URL + if self._is_sitemap(response): + # It's a sitemap, so parse it and extract URLs + return self._parse_sitemap(response.content) + else: + # It's not a sitemap, return the URL itself + return [sitemap_url] - # Extract URLs from the sitemap - # The namespace with "loc" tag might be needed to extract URLs - ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'} - urls = [loc.text for loc in root.findall('s:url/s:loc', ns)] + def _is_sitemap(self, response): + content_type = response.headers.get('Content-Type', '') + if 'xml' in content_type or response.url.endswith('.xml'): + return True - # Use your existing loader to load content of extracted URLs - loader = self.loader(urls) - return loader.load() \ No newline at end of file + if ' Date: Thu, 12 Oct 2023 19:45:36 +0400 Subject: [PATCH 07/17] fix wrong link --- application/parser/remote/sitemap_loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/application/parser/remote/sitemap_loader.py 
b/application/parser/remote/sitemap_loader.py index 0a3f4d4c..a8700555 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -33,9 +33,11 @@ class SitemapLoader(BaseRemote): return documents def _extract_urls(self, sitemap_url): - response = requests.get(sitemap_url) - if response.status_code != 200: - print(f"Failed to fetch sitemap: {sitemap_url}") + try: + response = requests.get(sitemap_url) + response.raise_for_status() # Raise an exception for HTTP errors + except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e: + print(f"Failed to fetch sitemap: {sitemap_url}. Error: {e}") return [] # Determine if this is a sitemap or a URL From 024674eef348e53ce7c54a4153d5bd852f00b350 Mon Sep 17 00:00:00 2001 From: Pavel Date: Fri, 13 Oct 2023 11:42:42 +0400 Subject: [PATCH 08/17] List check --- application/parser/remote/sitemap_loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index a8700555..e2339ab7 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -10,6 +10,10 @@ class SitemapLoader(BaseRemote): self.limit = limit # Adding limit to control the number of URLs to process def load_data(self, sitemap_url): + # Check if the input is a list and if it is, use the first element + if isinstance(sitemap_url, list) and sitemap_url: + url = sitemap_url[0] + urls = self._extract_urls(sitemap_url) if not urls: print(f"No URLs found in the sitemap: {sitemap_url}") From 8b3b16bce4d834c2f26aad0f506eff0341459ed6 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2023 08:46:35 +0100 Subject: [PATCH 09/17] inputs --- application/worker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/application/worker.py b/application/worker.py index fe4e2615..444772d5 100644 --- a/application/worker.py +++ b/application/worker.py @@ -106,7 +106,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): 'limited': False } -def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url'): +def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'url'): sample = False token_check = True min_tokens = 150 @@ -117,9 +117,11 @@ def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url' os.makedirs(full_path) self.update_state(state='PROGRESS', meta={'current': 1}) - + + # inputs {"data": [url]} for url type task just urls + # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, urls) + remote_loader = RemoteCreator.create_loader(loader, inputs['data']) raw_docs = remote_loader.load_data() raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) @@ -146,7 +148,7 @@ def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url' shutil.rmtree(full_path) return { - 'urls': urls, + 'urls': inputs['data'], 'name_job': name_job, 'user': user, 'limited': False From 381a2740ee23101594066b2efe46b4dfe2d120b4 Mon Sep 17 00:00:00 2001 From: Pavel Date: Fri, 13 Oct 2023 21:52:56 +0400 Subject: [PATCH 10/17] change input --- application/parser/remote/crawler_loader.py | 3 ++- application/parser/remote/sitemap_loader.py | 5 +++-- application/parser/remote/web_loader.py | 4 +++- application/worker.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git 
a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index 380a25bf..ee037e59 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -9,7 +9,8 @@ class CrawlerLoader(BaseRemote): self.loader = WebBaseLoader # Initialize the document loader self.limit = limit # Set the limit for the number of pages to scrape - def load_data(self, url): + def load_data(self, inputs): + url = inputs['data'] # Check if the input is a list and if it is, use the first element if isinstance(url, list) and url: url = url[0] diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index e2339ab7..0748f104 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -9,11 +9,12 @@ class SitemapLoader(BaseRemote): self.loader = WebBaseLoader self.limit = limit # Adding limit to control the number of URLs to process - def load_data(self, sitemap_url): + def load_data(self, inputs): + sitemap_url= inputs['data'] # Check if the input is a list and if it is, use the first element if isinstance(sitemap_url, list) and sitemap_url: url = sitemap_url[0] - + urls = self._extract_urls(sitemap_url) if not urls: print(f"No URLs found in the sitemap: {sitemap_url}") diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py index 4a55e1c5..e5cd2e2f 100644 --- a/application/parser/remote/web_loader.py +++ b/application/parser/remote/web_loader.py @@ -5,7 +5,9 @@ class WebLoader(BaseRemote): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader - def load_data(self, urls): + def load_data(self, inputs): + urls = inputs['data'] + if isinstance(urls, str): urls = [urls] # Convert string to list if a single URL is passed diff --git a/application/worker.py b/application/worker.py index 444772d5..90de0286 100644 --- a/application/worker.py +++ b/application/worker.py @@ -121,7 +121,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur # inputs {"data": [url]} for url type task just urls # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, inputs['data']) + remote_loader = RemoteCreator.create_loader(loader, inputs) raw_docs = remote_loader.load_data() raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) From d2dba3a0db8278ebec7f2e6571f77b0651c05bfd Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 13 Feb 2024 01:53:25 +0530 Subject: [PATCH 11/17] adding remote uploads tab --- frontend/src/upload/Upload.tsx | 182 +++++++++++++++++++++++++-------- frontend/tailwind.config.cjs | 5 +- 2 files changed, 146 insertions(+), 41 deletions(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index f0735bc9..0e45e8c5 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -4,8 +4,61 @@ import { useDropzone } from 'react-dropzone'; import { useDispatch } from 'react-redux'; import { ActiveState } from '../models/misc'; import { getDocs } from '../preferences/preferenceApi'; +import Arrow2 from '../assets/dropdown-arrow.svg'; import { setSourceDocs } from '../preferences/preferenceSlice'; - +type urlOption = { + label: string, + value: string +} | null +function DropdownUrlType({ + options, + selectedOption, + onSelect, +}: { + options: urlOption[]; + selectedOption: urlOption; + onSelect: (value: urlOption) 
=> void; +}) { + const [isOpen, setIsOpen] = useState(false); + return ( +
+ + {isOpen && ( +
+ {options.map((option, index) => ( +
+ { + onSelect(option); + setIsOpen(false); + }} + className="ml-2 flex-1 overflow-hidden overflow-ellipsis whitespace-nowrap px-1 py-3" + > + {option?.label} + +
+ ))} +
+ )} +
+ ); +} export default function Upload({ modalState, setModalState, @@ -14,6 +67,14 @@ export default function Upload({ setModalState: (state: ActiveState) => void; }) { const [docName, setDocName] = useState(''); + const [urlName, setUrlName] = useState('') + const [url, setUrl] = useState('') + const urlOptions: urlOption[] = [ + { label: 'Github', value: 'github' }, + { label: 'Sitemap', value: 'Sitemap' }, + { label: 'Link', value: 'link' }] + const [urlType, setUrlType] = useState(null) + const [activeTab, setActiveTab] = useState('file'); const [files, setfiles] = useState([]); const [progress, setProgress] = useState<{ type: 'UPLOAD' | 'TRAINIING'; @@ -55,9 +116,8 @@ export default function Upload({ setProgress(undefined); setModalState('INACTIVE'); }} - className={`rounded-3xl bg-purple-30 px-4 py-2 text-sm font-medium text-white ${ - isCancellable ? '' : 'hidden' - }`} + className={`rounded-3xl bg-purple-30 px-4 py-2 text-sm font-medium text-white ${isCancellable ? '' : 'hidden' + }`} > Finish @@ -166,7 +226,6 @@ export default function Upload({ ['.docx'], }, }); - let view; if (progress?.type === 'UPLOAD') { view = ; @@ -176,41 +235,85 @@ export default function Upload({ view = ( <>

[JSX markup lost in extraction. The surviving fragments show this hunk removing the original upload form: the "Upload New Documentation" heading, the "Please upload .pdf, .txt, .rst, .docx, .md, .zip limited to 25mb" note, the name input wired to setDocName, the "Choose Files" button, and the "Uploaded Files" list that renders each file.name (or "None" when empty). The same controls are re-added inside an activeTab === 'file' branch; the fragments that follow belong to the new activeTab === 'remote' branch.]
+ + ) + } + { + activeTab === 'remote' && ( + <> + setUrlType(value)} selectedOption={urlType} options={urlOptions} /> + setUrlName(e.target.value)} + > +
+ Name +
+ setUrl(e.target.value)} + > +
+ Link +
+ + ) + }
@@ -232,9 +335,8 @@ export default function Upload({ return (
{view} diff --git a/frontend/tailwind.config.cjs b/frontend/tailwind.config.cjs index 5946c5a3..50af33c8 100644 --- a/frontend/tailwind.config.cjs +++ b/frontend/tailwind.config.cjs @@ -43,7 +43,10 @@ module.exports = { 'dark-charcoal':'#2F3036', 'bright-gray':'#ECECF1', 'outer-space':'#444654', - 'gun-metal':'#2E303E' + 'gun-metal':'#2E303E', + 'sonic-silver':'#747474', + 'soap':'#D8CCF1', + 'independence':'#54546D' }, }, }, From 030c2a740ff5c4dc034ab041c73218162d90eefb Mon Sep 17 00:00:00 2001 From: Pavel Date: Tue, 13 Feb 2024 23:41:36 +0300 Subject: [PATCH 12/17] upload_remote class --- application/api/user/routes.py | 28 +++++++++++++++++++++++++++- application/api/user/tasks.py | 7 ++++++- application/worker.py | 8 ++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 86742572..592a82cd 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -5,7 +5,7 @@ from pymongo import MongoClient from bson.objectid import ObjectId from werkzeug.utils import secure_filename -from application.api.user.tasks import ingest +from application.api.user.tasks import ingest, ingest_remote from application.core.settings import settings from application.vectorstore.vector_creator import VectorCreator @@ -157,6 +157,32 @@ def upload_file(): return {"status": "ok", "task_id": task_id} else: return {"status": "error"} + +@user.route("/api/remote", methods=["POST"]) +def upload_remote(): + """Upload a remote source to get vectorized and indexed.""" + if "user" not in request.form: + return {"status": "no user"} + user = secure_filename(request.form["user"]) + if "source" not in request.form: + return {"status": "no source"} + source = secure_filename(request.form["source"]) + if "name" not in request.form: + return {"status": "no name"} + job_name = secure_filename(request.form["name"]) + # check if the post request has the file part + if "data" not in request.form: + print("No data") + return {"status": "no data"} + source_data = request.form["data"] + + if source_data: + task = ingest_remote.delay(source_data=source_data, job_name=job_name, user=user, loader=source) + # task id + task_id = task.id + return {"status": "ok", "task_id": task_id} + else: + return {"status": "error"} @user.route("/api/task_status", methods=["GET"]) def task_status(): diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index a3474939..4602bf85 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -1,7 +1,12 @@ -from application.worker import ingest_worker +from application.worker import ingest_worker, remote_worker from application.celery import celery @celery.task(bind=True) def ingest(self, directory, formats, name_job, filename, user): resp = ingest_worker(self, directory, formats, name_job, filename, user) return resp + +@celery.task(bind=True) +def ingest_remote(self, source_data, job_name, user, loader): + resp = remote_worker(self, source_data, job_name, user, loader) + return resp diff --git a/application/worker.py b/application/worker.py index 5fc28749..50344a26 100644 --- a/application/worker.py +++ b/application/worker.py @@ -123,7 +123,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): 'limited': False } -def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'url'): +def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'): sample = False token_check = True min_tokens = 150 
@@ -135,10 +135,10 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur self.update_state(state='PROGRESS', meta={'current': 1}) - # inputs {"data": [url]} for url type task just urls + # source_data {"data": [url]} for url type task just urls # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, inputs) + remote_loader = RemoteCreator.create_loader(loader, source_data) raw_docs = remote_loader.load_data() raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) @@ -165,7 +165,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur shutil.rmtree(full_path) return { - 'urls': inputs['data'], + 'urls': source_data['data'], 'name_job': name_job, 'user': user, 'limited': False From 0cb3d12d94685535c9cba3af7436244805e4abe8 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 14 Feb 2024 15:17:56 +0000 Subject: [PATCH 13/17] Refactor loader classes to accept inputs directly --- application/parser/remote/crawler_loader.py | 2 +- application/parser/remote/sitemap_loader.py | 2 +- application/parser/remote/web_loader.py | 2 +- application/worker.py | 14 ++++++-------- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index ee037e59..2a63f284 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -10,7 +10,7 @@ class CrawlerLoader(BaseRemote): self.limit = limit # Set the limit for the number of pages to scrape def load_data(self, inputs): - url = inputs['data'] + url = inputs # Check if the input is a list and if it is, use the first element if isinstance(url, list) and url: url = url[0] diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index 0748f104..6e9182c4 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -10,7 +10,7 @@ class SitemapLoader(BaseRemote): self.limit = limit # Adding limit to control the number of URLs to process def load_data(self, inputs): - sitemap_url= inputs['data'] + sitemap_url= inputs # Check if the input is a list and if it is, use the first element if isinstance(sitemap_url, list) and sitemap_url: url = sitemap_url[0] diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py index e5cd2e2f..9fc50c1c 100644 --- a/application/parser/remote/web_loader.py +++ b/application/parser/remote/web_loader.py @@ -6,7 +6,7 @@ class WebLoader(BaseRemote): self.loader = WebBaseLoader def load_data(self, inputs): - urls = inputs['data'] + urls = inputs if isinstance(urls, str): urls = [urls] # Convert string to list if a single URL is passed diff --git a/application/worker.py b/application/worker.py index 50344a26..875611bf 100644 --- a/application/worker.py +++ b/application/worker.py @@ -138,19 +138,17 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader # source_data {"data": [url]} for url type task just urls # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, source_data) - raw_docs = remote_loader.load_data() + remote_loader = RemoteCreator.create_loader(loader) + raw_docs = remote_loader.load_data(source_data) - raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + docs = 
group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) - docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + #docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] call_openai_api(docs, full_path, self) self.update_state(state='PROGRESS', meta={'current': 100}) + - if sample: - for i in range(min(5, len(raw_docs))): - print(raw_docs[i].text) # Proceed with uploading and cleaning as in the original function file_data = {'name': name_job, 'user': user} @@ -165,7 +163,7 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader shutil.rmtree(full_path) return { - 'urls': source_data['data'], + 'urls': source_data, 'name_job': name_job, 'user': user, 'limited': False From c4c0516820e7244d9b89ba26ba91c19ded343bda Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Feb 2024 14:31:54 +0000 Subject: [PATCH 14/17] add endpoint --- frontend/src/upload/Upload.tsx | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 0e45e8c5..e3ccad06 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -209,6 +209,29 @@ export default function Upload({ xhr.send(formData); }; + const uploadRemote = () => { + console.log("here") + const formData = new FormData(); + formData.append('name', urlName); + formData.append('user', 'local'); + if (urlType !== null) { + formData.append('source', urlType?.value); + } + formData.append('data', url); + const apiHost = import.meta.env.VITE_API_HOST; + const xhr = new XMLHttpRequest(); + xhr.upload.addEventListener('progress', (event) => { + const progress = +((event.loaded / event.total) * 100).toFixed(2); + setProgress({ type: 'UPLOAD', percentage: progress }); + }); + xhr.onload = () => { + const { task_id } = JSON.parse(xhr.responseText); + setProgress({ type: 'TRAINIING', percentage: 0, taskId: task_id }); + }; + xhr.open('POST', `${apiHost + '/api/remote'}`); + xhr.send(formData); + }; + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, multiple: false, @@ -309,12 +332,12 @@ export default function Upload({ }
From 325a8889ab106bda471f08a4572dd70ccfcc3fed Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 27 Feb 2024 11:52:51 +0000 Subject: [PATCH 15/17] update url --- frontend/src/upload/Upload.tsx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index e3ccad06..d0d5bf6a 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -70,9 +70,9 @@ export default function Upload({ const [urlName, setUrlName] = useState('') const [url, setUrl] = useState('') const urlOptions: urlOption[] = [ - { label: 'Github', value: 'github' }, - { label: 'Sitemap', value: 'Sitemap' }, - { label: 'Link', value: 'link' }] + { label: 'Crawler', value: 'crawler' }, + { label: 'Sitemap', value: 'sitemap' }, + { label: 'Link', value: 'url' }] const [urlType, setUrlType] = useState(null) const [activeTab, setActiveTab] = useState('file'); const [files, setfiles] = useState([]); From 54d187a0ade1f2f82d66067b86107114db8eaee8 Mon Sep 17 00:00:00 2001 From: Pavel Date: Wed, 28 Feb 2024 19:52:58 +0300 Subject: [PATCH 16/17] Fixing ingestion metadata grouping --- .gitignore | 1 + application/parser/remote/base.py | 2 +- application/parser/remote/telegram.py | 4 ++-- application/worker.py | 8 ++++---- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 053e5793..1a5f0419 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ application/vectors/ node_modules/ .vscode/settings.json models/ +model/ \ No newline at end of file diff --git a/application/parser/remote/base.py b/application/parser/remote/base.py index 75ae34d5..91313f22 100644 --- a/application/parser/remote/base.py +++ b/application/parser/remote/base.py @@ -1,6 +1,6 @@ """Base reader class.""" from abc import abstractmethod -from typing import Any, List, Iterator +from typing import Any, List from langchain.docstore.document import Document as LCDocument from application.parser.schema.base import Document diff --git a/application/parser/remote/telegram.py b/application/parser/remote/telegram.py index 895d5cb3..0e691be4 100644 --- a/application/parser/remote/telegram.py +++ b/application/parser/remote/telegram.py @@ -1,4 +1,4 @@ -from langchain.document_loader import TelegramChatApiLoader, TelegramChatFileLoader +from langchain.document_loader import TelegramChatApiLoader from application.parser.remote.base import BaseRemote class TelegramChatApiRemote(BaseRemote): @@ -8,4 +8,4 @@ class TelegramChatApiRemote(BaseRemote): def parse_file(self, *args, **load_kwargs): - return text \ No newline at end of file + return \ No newline at end of file diff --git a/application/worker.py b/application/worker.py index 875611bf..21bb319f 100644 --- a/application/worker.py +++ b/application/worker.py @@ -124,7 +124,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): } def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'): - sample = False + # sample = False token_check = True min_tokens = 150 max_tokens = 1250 @@ -155,10 +155,10 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader if settings.VECTOR_STORE == "faiss": files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), 'file_pkl': open(full_path + '/index.pkl', 'rb')} - response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) - response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) + 
requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) else: - response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) + requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) shutil.rmtree(full_path) From f4288f0bd4abc949b2a2c5ede16f1d770ba33216 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 1 Mar 2024 14:41:03 +0000 Subject: [PATCH 17/17] remove sitemap --- frontend/src/upload/Upload.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index d0d5bf6a..c9c5cde4 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -71,7 +71,7 @@ export default function Upload({ const [url, setUrl] = useState('') const urlOptions: urlOption[] = [ { label: 'Crawler', value: 'crawler' }, - { label: 'Sitemap', value: 'sitemap' }, + // { label: 'Sitemap', value: 'sitemap' }, { label: 'Link', value: 'url' }] const [urlType, setUrlType] = useState(null) const [activeTab, setActiveTab] = useState('file');
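A quick way to exercise the remote ingestion path these patches add is to call the new /api/remote endpoint directly. The sketch below is not part of the patch series: the host/port, job name and example URL are assumptions, while the form fields (user, name, source, data), the loader keys and the {"status": "ok", "task_id": ...} response shape come from the upload_remote route and RemoteCreator code above.

# Hedged usage sketch (assumes the Flask API and a Celery worker are running;
# adjust API_HOST to your deployment: the value below is an assumption, not taken from the patch).
import requests

API_HOST = "http://localhost:7091"  # assumed local deployment

payload = {
    "user": "local",                # same default user the frontend's uploadRemote() sends
    "name": "docs-site",            # job name; remote_worker also uses it as the index folder name
    "source": "crawler",            # one of the RemoteCreator keys: "url", "sitemap", "crawler"
    "data": "https://example.com",  # placeholder URL to ingest
}

# upload_remote() reads these fields from request.form, so send them as form data.
resp = requests.post(f"{API_HOST}/api/remote", data=payload)
resp.raise_for_status()
print(resp.json())  # expected shape: {"status": "ok", "task_id": "<celery task id>"}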