From 8f2ad38503603a5da686bedc94b79193d91e7764 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 11 Oct 2023 10:13:51 +0100 Subject: [PATCH 01/17] tests --- application/parser/remote/base.py | 19 +++++++++++++++++++ application/parser/remote/telegram.py | 11 +++++++++++ 2 files changed, 30 insertions(+) create mode 100644 application/parser/remote/base.py create mode 100644 application/parser/remote/telegram.py diff --git a/application/parser/remote/base.py b/application/parser/remote/base.py new file mode 100644 index 00000000..91313f22 --- /dev/null +++ b/application/parser/remote/base.py @@ -0,0 +1,19 @@ +"""Base reader class.""" +from abc import abstractmethod +from typing import Any, List + +from langchain.docstore.document import Document as LCDocument +from application.parser.schema.base import Document + + +class BaseRemote: + """Utilities for loading data from a directory.""" + + @abstractmethod + def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: + """Load data from the input directory.""" + + def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: + """Load data in LangChain document format.""" + docs = self.load_data(**load_kwargs) + return [d.to_langchain_format() for d in docs] diff --git a/application/parser/remote/telegram.py b/application/parser/remote/telegram.py new file mode 100644 index 00000000..895d5cb3 --- /dev/null +++ b/application/parser/remote/telegram.py @@ -0,0 +1,11 @@ +from langchain.document_loader import TelegramChatApiLoader, TelegramChatFileLoader +from application.parser.remote.base import BaseRemote + +class TelegramChatApiRemote(BaseRemote): + def _init_parser(self, *args, **load_kwargs): + self.loader = TelegramChatApiLoader(**load_kwargs) + return {} + + def parse_file(self, *args, **load_kwargs): + + return text \ No newline at end of file From 658867cb46e253b8ae8b128f81ea5e50d999d613 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 01:03:40 +0400 Subject: [PATCH 02/17] No crawler, no sitemap --- application/parser/remote/base.py | 2 +- application/parser/remote/crawler_loader.py | 0 application/parser/remote/github_loader.py | 0 application/parser/remote/remote_creator.py | 18 ++++++++ application/parser/remote/sitemap_loader.py | 0 application/parser/remote/web_loader.py | 10 +++++ application/worker.py | 47 +++++++++++++++++++++ 7 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 application/parser/remote/crawler_loader.py create mode 100644 application/parser/remote/github_loader.py create mode 100644 application/parser/remote/remote_creator.py create mode 100644 application/parser/remote/sitemap_loader.py create mode 100644 application/parser/remote/web_loader.py diff --git a/application/parser/remote/base.py b/application/parser/remote/base.py index 91313f22..75ae34d5 100644 --- a/application/parser/remote/base.py +++ b/application/parser/remote/base.py @@ -1,6 +1,6 @@ """Base reader class.""" from abc import abstractmethod -from typing import Any, List +from typing import Any, List, Iterator from langchain.docstore.document import Document as LCDocument from application.parser.schema.base import Document diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/github_loader.py b/application/parser/remote/github_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/remote_creator.py 
b/application/parser/remote/remote_creator.py new file mode 100644 index 00000000..e12b7a02 --- /dev/null +++ b/application/parser/remote/remote_creator.py @@ -0,0 +1,18 @@ +# from sitemap_loader import SitemapLoader +# from crawler_loader import CrawlerLoader +from application.parser.remote.web_loader import WebLoader + + +class RemoteCreator: + loaders = { + 'url': WebLoader, + # 'sitemap': SitemapLoader, + # 'crawler': CrawlerLoader + } + + @classmethod + def create_loader(cls, type, *args, **kwargs): + loader_class = cls.loaders.get(type.lower()) + if not loader_class: + raise ValueError(f"No LLM class found for type {type}") + return loader_class(*args, **kwargs) \ No newline at end of file diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py new file mode 100644 index 00000000..e69de29b diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py new file mode 100644 index 00000000..ad2847e2 --- /dev/null +++ b/application/parser/remote/web_loader.py @@ -0,0 +1,10 @@ +from application.parser.remote.base import BaseRemote + +class WebLoader(BaseRemote): + def __init__(self): + from langchain.document_loaders import WebBaseLoader + self.loader = WebBaseLoader + + def load_data(self, urls): + loader = self.loader(urls) + return loader.load() \ No newline at end of file diff --git a/application/worker.py b/application/worker.py index 71fcd615..fe4e2615 100644 --- a/application/worker.py +++ b/application/worker.py @@ -9,6 +9,7 @@ import requests from application.core.settings import settings from application.parser.file.bulk import SimpleDirectoryReader +from application.parser.remote.remote_creator import RemoteCreator from application.parser.open_ai_func import call_openai_api from application.parser.schema.base import Document from application.parser.token_func import group_split @@ -104,3 +105,49 @@ def ingest_worker(self, directory, formats, name_job, filename, user): 'user': user, 'limited': False } + +def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url'): + sample = False + token_check = True + min_tokens = 150 + max_tokens = 1250 + full_path = directory + '/' + user + '/' + name_job + + if not os.path.exists(full_path): + os.makedirs(full_path) + + self.update_state(state='PROGRESS', meta={'current': 1}) + + # Use RemoteCreator to load data from URL + remote_loader = RemoteCreator.create_loader(loader, urls) + raw_docs = remote_loader.load_data() + + raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + + docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + + call_openai_api(docs, full_path, self) + self.update_state(state='PROGRESS', meta={'current': 100}) + + if sample: + for i in range(min(5, len(raw_docs))): + print(raw_docs[i].text) + + # Proceed with uploading and cleaning as in the original function + file_data = {'name': name_job, 'user': user} + if settings.VECTOR_STORE == "faiss": + files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), + 'file_pkl': open(full_path + '/index.pkl', 'rb')} + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) + else: + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) + + shutil.rmtree(full_path) + + return { + 'urls': urls, + 'name_job': name_job, + 'user': user, 
+ 'limited': False + } \ No newline at end of file From c517bdd2e162d4a17c31143b2c352a7055b91a36 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 12:35:26 +0400 Subject: [PATCH 03/17] Crawler + sitemap --- application/parser/remote/crawler_loader.py | 36 +++++++++++++++++++++ application/parser/remote/sitemap_loader.py | 27 ++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index e69de29b..208195cb 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -0,0 +1,36 @@ +import requests +from urllib.parse import urlparse, urljoin +from bs4 import BeautifulSoup +from application.parser.remote.base import BaseRemote + +class CrawlerLoader(BaseRemote): + def __init__(self): + from langchain.document_loaders import WebBaseLoader + self.loader = WebBaseLoader + + def load_data(self, url): + # Fetch the content of the initial URL + response = requests.get(url) + if response.status_code != 200: + print(f"Failed to fetch initial URL: {url}") + return None + + # Parse the HTML content + soup = BeautifulSoup(response.text, 'html.parser') + + # Extract the base URL to ensure we only fetch URLs from the same domain + base_url = urlparse(url).scheme + "://" + urlparse(url).hostname + + # Extract all links from the HTML content + all_links = [a['href'] for a in soup.find_all('a', href=True)] + + # Filter out the links that lead to a different domain + same_domain_links = [urljoin(base_url, link) for link in all_links if base_url in urljoin(base_url, link)] + + # Remove duplicates + same_domain_links = list(set(same_domain_links)) + + #TODO: Optimize this section to parse pages as they are being crawled + loaded_content = self.loader(same_domain_links).load() + + return loaded_content diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index e69de29b..366d81ed 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -0,0 +1,27 @@ +import requests +import xml.etree.ElementTree as ET +from application.parser.remote.base import BaseRemote + +class SitemapLoader(BaseRemote): + def __init__(self): + from langchain.document_loaders import WebBaseLoader + self.loader = WebBaseLoader + + def load_data(self, sitemap_url): + # Fetch the sitemap content + response = requests.get(sitemap_url) + if response.status_code != 200: + print(f"Failed to fetch sitemap: {sitemap_url}") + return None + + # Parse the sitemap XML + root = ET.fromstring(response.content) + + # Extract URLs from the sitemap + # The namespace with "loc" tag might be needed to extract URLs + ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'} + urls = [loc.text for loc in root.findall('s:url/s:loc', ns)] + + # Use your existing loader to load content of extracted URLs + loader = self.loader(urls) + return loader.load() \ No newline at end of file From 50f07f9ef5f64292ba6e9f6e7b9cc2b2bafc60c0 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 12:53:33 +0400 Subject: [PATCH 04/17] limit crawler --- application/parser/remote/crawler_loader.py | 49 ++++++++++++++------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index 208195cb..9acb8f39 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -4,33 +4,48 @@ from 
bs4 import BeautifulSoup from application.parser.remote.base import BaseRemote class CrawlerLoader(BaseRemote): - def __init__(self): + def __init__(self, limit=10): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader + self.limit = limit def load_data(self, url): - # Fetch the content of the initial URL - response = requests.get(url) - if response.status_code != 200: - print(f"Failed to fetch initial URL: {url}") - return None - - # Parse the HTML content - soup = BeautifulSoup(response.text, 'html.parser') + # Create a set to store visited URLs to avoid revisiting the same page + visited_urls = set() # Extract the base URL to ensure we only fetch URLs from the same domain base_url = urlparse(url).scheme + "://" + urlparse(url).hostname - # Extract all links from the HTML content - all_links = [a['href'] for a in soup.find_all('a', href=True)] + # Initialize a list with the initial URL + urls_to_visit = [url] - # Filter out the links that lead to a different domain - same_domain_links = [urljoin(base_url, link) for link in all_links if base_url in urljoin(base_url, link)] + while urls_to_visit: + current_url = urls_to_visit.pop(0) + visited_urls.add(current_url) - # Remove duplicates - same_domain_links = list(set(same_domain_links)) + # Fetch the content of the current URL + response = requests.get(current_url) + if response.status_code != 200: + print(f"Failed to fetch URL: {current_url}") + continue + + # Parse the HTML content + soup = BeautifulSoup(response.text, 'html.parser') + + # Extract all links from the HTML content + all_links = [urljoin(current_url, a['href']) for a in soup.find_all('a', href=True) if base_url in urljoin(current_url, a['href'])] + + # Add the new links to the urls_to_visit list if they haven't been visited yet + urls_to_visit.extend([link for link in all_links if link not in visited_urls]) + + # Remove duplicates + urls_to_visit = list(set(urls_to_visit)) + + # Stop if the limit is reached + if self.limit is not None and len(visited_urls) >= self.limit: + break #TODO: Optimize this section to parse pages as they are being crawled - loaded_content = self.loader(same_domain_links).load() + loaded_content = self.loader(list(visited_urls)).load() - return loaded_content + return loaded_content \ No newline at end of file From 2cfb416fd053a5e32c508ee31aead6b055b3aedf Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 13:44:32 +0400 Subject: [PATCH 05/17] Desc loader --- application/parser/remote/crawler_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index 9acb8f39..2364dc27 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -7,6 +7,7 @@ class CrawlerLoader(BaseRemote): def __init__(self, limit=10): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader + #No pages scraped limit, set None for no limit self.limit = limit def load_data(self, url): From 719ca63ec126e270334b359a54c217e995dcca31 Mon Sep 17 00:00:00 2001 From: Pavel Date: Thu, 12 Oct 2023 19:40:23 +0400 Subject: [PATCH 06/17] fixes --- application/parser/remote/crawler_loader.py | 62 ++++++++++-------- application/parser/remote/remote_creator.py | 8 +-- application/parser/remote/sitemap_loader.py | 71 +++++++++++++++++---- application/parser/remote/web_loader.py | 14 +++- 4 files changed, 109 insertions(+), 46 deletions(-) diff --git a/application/parser/remote/crawler_loader.py 
b/application/parser/remote/crawler_loader.py index 2364dc27..380a25bf 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -6,47 +6,53 @@ from application.parser.remote.base import BaseRemote class CrawlerLoader(BaseRemote): def __init__(self, limit=10): from langchain.document_loaders import WebBaseLoader - self.loader = WebBaseLoader - #No pages scraped limit, set None for no limit - self.limit = limit + self.loader = WebBaseLoader # Initialize the document loader + self.limit = limit # Set the limit for the number of pages to scrape def load_data(self, url): - # Create a set to store visited URLs to avoid revisiting the same page - visited_urls = set() + # Check if the input is a list and if it is, use the first element + if isinstance(url, list) and url: + url = url[0] - # Extract the base URL to ensure we only fetch URLs from the same domain - base_url = urlparse(url).scheme + "://" + urlparse(url).hostname + # Check if the URL scheme is provided, if not, assume http + if not urlparse(url).scheme: + url = "http://" + url - # Initialize a list with the initial URL - urls_to_visit = [url] + visited_urls = set() # Keep track of URLs that have been visited + base_url = urlparse(url).scheme + "://" + urlparse(url).hostname # Extract the base URL + urls_to_visit = [url] # List of URLs to be visited, starting with the initial URL + loaded_content = [] # Store the loaded content from each URL + # Continue crawling until there are no more URLs to visit while urls_to_visit: - current_url = urls_to_visit.pop(0) - visited_urls.add(current_url) + current_url = urls_to_visit.pop(0) # Get the next URL to visit + visited_urls.add(current_url) # Mark the URL as visited - # Fetch the content of the current URL - response = requests.get(current_url) - if response.status_code != 200: - print(f"Failed to fetch URL: {current_url}") + # Try to load and process the content from the current URL + try: + response = requests.get(current_url) # Fetch the content of the current URL + response.raise_for_status() # Raise an exception for HTTP errors + loader = self.loader([current_url]) # Initialize the document loader for the current URL + loaded_content.extend(loader.load()) # Load the content and add it to the loaded_content list + except Exception as e: + # Print an error message if loading or processing fails and continue with the next URL + print(f"Error processing URL {current_url}: {e}") continue - # Parse the HTML content + # Parse the HTML content to extract all links soup = BeautifulSoup(response.text, 'html.parser') + all_links = [ + urljoin(current_url, a['href']) + for a in soup.find_all('a', href=True) + if base_url in urljoin(current_url, a['href']) # Ensure links are from the same domain + ] - # Extract all links from the HTML content - all_links = [urljoin(current_url, a['href']) for a in soup.find_all('a', href=True) if base_url in urljoin(current_url, a['href'])] - - # Add the new links to the urls_to_visit list if they haven't been visited yet + # Add new links to the list of URLs to visit if they haven't been visited yet urls_to_visit.extend([link for link in all_links if link not in visited_urls]) + urls_to_visit = list(set(urls_to_visit)) # Remove duplicate URLs - # Remove duplicates - urls_to_visit = list(set(urls_to_visit)) - - # Stop if the limit is reached + # Stop crawling if the limit of pages to scrape is reached if self.limit is not None and len(visited_urls) >= self.limit: break - #TODO: Optimize this section to parse pages 
as they are being crawled - loaded_content = self.loader(list(visited_urls)).load() - - return loaded_content \ No newline at end of file + return loaded_content # Return the loaded content from all visited URLs diff --git a/application/parser/remote/remote_creator.py b/application/parser/remote/remote_creator.py index e12b7a02..e45333d4 100644 --- a/application/parser/remote/remote_creator.py +++ b/application/parser/remote/remote_creator.py @@ -1,13 +1,13 @@ -# from sitemap_loader import SitemapLoader -# from crawler_loader import CrawlerLoader +from application.parser.remote.sitemap_loader import SitemapLoader +from application.parser.remote.crawler_loader import CrawlerLoader from application.parser.remote.web_loader import WebLoader class RemoteCreator: loaders = { 'url': WebLoader, - # 'sitemap': SitemapLoader, - # 'crawler': CrawlerLoader + 'sitemap': SitemapLoader, + 'crawler': CrawlerLoader } @classmethod diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index 366d81ed..0a3f4d4c 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -1,27 +1,74 @@ import requests +import re # Import regular expression library import xml.etree.ElementTree as ET from application.parser.remote.base import BaseRemote class SitemapLoader(BaseRemote): - def __init__(self): + def __init__(self, limit=20): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader + self.limit = limit # Adding limit to control the number of URLs to process def load_data(self, sitemap_url): - # Fetch the sitemap content + urls = self._extract_urls(sitemap_url) + if not urls: + print(f"No URLs found in the sitemap: {sitemap_url}") + return [] + + # Load content of extracted URLs + documents = [] + processed_urls = 0 # Counter for processed URLs + for url in urls: + if self.limit is not None and processed_urls >= self.limit: + break # Stop processing if the limit is reached + + try: + loader = self.loader([url]) + documents.extend(loader.load()) + processed_urls += 1 # Increment the counter after processing each URL + except Exception as e: + print(f"Error processing URL {url}: {e}") + continue + + return documents + + def _extract_urls(self, sitemap_url): response = requests.get(sitemap_url) if response.status_code != 200: print(f"Failed to fetch sitemap: {sitemap_url}") - return None + return [] - # Parse the sitemap XML - root = ET.fromstring(response.content) + # Determine if this is a sitemap or a URL + if self._is_sitemap(response): + # It's a sitemap, so parse it and extract URLs + return self._parse_sitemap(response.content) + else: + # It's not a sitemap, return the URL itself + return [sitemap_url] - # Extract URLs from the sitemap - # The namespace with "loc" tag might be needed to extract URLs - ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'} - urls = [loc.text for loc in root.findall('s:url/s:loc', ns)] + def _is_sitemap(self, response): + content_type = response.headers.get('Content-Type', '') + if 'xml' in content_type or response.url.endswith('.xml'): + return True - # Use your existing loader to load content of extracted URLs - loader = self.loader(urls) - return loader.load() \ No newline at end of file + if ' Date: Thu, 12 Oct 2023 19:45:36 +0400 Subject: [PATCH 07/17] fix wrong link --- application/parser/remote/sitemap_loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/application/parser/remote/sitemap_loader.py 
b/application/parser/remote/sitemap_loader.py index 0a3f4d4c..a8700555 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -33,9 +33,11 @@ class SitemapLoader(BaseRemote): return documents def _extract_urls(self, sitemap_url): - response = requests.get(sitemap_url) - if response.status_code != 200: - print(f"Failed to fetch sitemap: {sitemap_url}") + try: + response = requests.get(sitemap_url) + response.raise_for_status() # Raise an exception for HTTP errors + except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e: + print(f"Failed to fetch sitemap: {sitemap_url}. Error: {e}") return [] # Determine if this is a sitemap or a URL From 024674eef348e53ce7c54a4153d5bd852f00b350 Mon Sep 17 00:00:00 2001 From: Pavel Date: Fri, 13 Oct 2023 11:42:42 +0400 Subject: [PATCH 08/17] List check --- application/parser/remote/sitemap_loader.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index a8700555..e2339ab7 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -10,6 +10,10 @@ class SitemapLoader(BaseRemote): self.limit = limit # Adding limit to control the number of URLs to process def load_data(self, sitemap_url): + # Check if the input is a list and if it is, use the first element + if isinstance(sitemap_url, list) and sitemap_url: + url = sitemap_url[0] + urls = self._extract_urls(sitemap_url) if not urls: print(f"No URLs found in the sitemap: {sitemap_url}") From 8b3b16bce4d834c2f26aad0f506eff0341459ed6 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 13 Oct 2023 08:46:35 +0100 Subject: [PATCH 09/17] inputs --- application/worker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/application/worker.py b/application/worker.py index fe4e2615..444772d5 100644 --- a/application/worker.py +++ b/application/worker.py @@ -106,7 +106,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): 'limited': False } -def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url'): +def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'url'): sample = False token_check = True min_tokens = 150 @@ -117,9 +117,11 @@ def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url' os.makedirs(full_path) self.update_state(state='PROGRESS', meta={'current': 1}) - + + # inputs {"data": [url]} for url type task just urls + # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, urls) + remote_loader = RemoteCreator.create_loader(loader, inputs['data']) raw_docs = remote_loader.load_data() raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) @@ -146,7 +148,7 @@ def remote_worker(self, urls, name_job, user, directory = 'temp', loader = 'url' shutil.rmtree(full_path) return { - 'urls': urls, + 'urls': inputs['data'], 'name_job': name_job, 'user': user, 'limited': False From 381a2740ee23101594066b2efe46b4dfe2d120b4 Mon Sep 17 00:00:00 2001 From: Pavel Date: Fri, 13 Oct 2023 21:52:56 +0400 Subject: [PATCH 10/17] change input --- application/parser/remote/crawler_loader.py | 3 ++- application/parser/remote/sitemap_loader.py | 5 +++-- application/parser/remote/web_loader.py | 4 +++- application/worker.py | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git 
a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index 380a25bf..ee037e59 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -9,7 +9,8 @@ class CrawlerLoader(BaseRemote): self.loader = WebBaseLoader # Initialize the document loader self.limit = limit # Set the limit for the number of pages to scrape - def load_data(self, url): + def load_data(self, inputs): + url = inputs['data'] # Check if the input is a list and if it is, use the first element if isinstance(url, list) and url: url = url[0] diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index e2339ab7..0748f104 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -9,11 +9,12 @@ class SitemapLoader(BaseRemote): self.loader = WebBaseLoader self.limit = limit # Adding limit to control the number of URLs to process - def load_data(self, sitemap_url): + def load_data(self, inputs): + sitemap_url= inputs['data'] # Check if the input is a list and if it is, use the first element if isinstance(sitemap_url, list) and sitemap_url: url = sitemap_url[0] - + urls = self._extract_urls(sitemap_url) if not urls: print(f"No URLs found in the sitemap: {sitemap_url}") diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py index 4a55e1c5..e5cd2e2f 100644 --- a/application/parser/remote/web_loader.py +++ b/application/parser/remote/web_loader.py @@ -5,7 +5,9 @@ class WebLoader(BaseRemote): from langchain.document_loaders import WebBaseLoader self.loader = WebBaseLoader - def load_data(self, urls): + def load_data(self, inputs): + urls = inputs['data'] + if isinstance(urls, str): urls = [urls] # Convert string to list if a single URL is passed diff --git a/application/worker.py b/application/worker.py index 444772d5..90de0286 100644 --- a/application/worker.py +++ b/application/worker.py @@ -121,7 +121,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur # inputs {"data": [url]} for url type task just urls # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, inputs['data']) + remote_loader = RemoteCreator.create_loader(loader, inputs) raw_docs = remote_loader.load_data() raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) From d2dba3a0db8278ebec7f2e6571f77b0651c05bfd Mon Sep 17 00:00:00 2001 From: ManishMadan2882 Date: Tue, 13 Feb 2024 01:53:25 +0530 Subject: [PATCH 11/17] adding remote uploads tab --- frontend/src/upload/Upload.tsx | 182 +++++++++++++++++++++++++-------- frontend/tailwind.config.cjs | 5 +- 2 files changed, 146 insertions(+), 41 deletions(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index f0735bc9..0e45e8c5 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -4,8 +4,61 @@ import { useDropzone } from 'react-dropzone'; import { useDispatch } from 'react-redux'; import { ActiveState } from '../models/misc'; import { getDocs } from '../preferences/preferenceApi'; +import Arrow2 from '../assets/dropdown-arrow.svg'; import { setSourceDocs } from '../preferences/preferenceSlice'; - +type urlOption = { + label: string, + value: string +} | null +function DropdownUrlType({ + options, + selectedOption, + onSelect, +}: { + options: urlOption[]; + selectedOption: urlOption; + onSelect: (value: urlOption) 
=> void; +}) { + const [isOpen, setIsOpen] = useState(false); + return ( +
+ + {isOpen && ( +
+ {options.map((option, index) => ( +
+ { + onSelect(option); + setIsOpen(false); + }} + className="ml-2 flex-1 overflow-hidden overflow-ellipsis whitespace-nowrap px-1 py-3" + > + {option?.label} + +
+ ))} +
+ )} +
+ ); +} export default function Upload({ modalState, setModalState, @@ -14,6 +67,14 @@ export default function Upload({ setModalState: (state: ActiveState) => void; }) { const [docName, setDocName] = useState(''); + const [urlName, setUrlName] = useState('') + const [url, setUrl] = useState('') + const urlOptions: urlOption[] = [ + { label: 'Github', value: 'github' }, + { label: 'Sitemap', value: 'Sitemap' }, + { label: 'Link', value: 'link' }] + const [urlType, setUrlType] = useState(null) + const [activeTab, setActiveTab] = useState('file'); const [files, setfiles] = useState([]); const [progress, setProgress] = useState<{ type: 'UPLOAD' | 'TRAINIING'; @@ -55,9 +116,8 @@ export default function Upload({ setProgress(undefined); setModalState('INACTIVE'); }} - className={`rounded-3xl bg-purple-30 px-4 py-2 text-sm font-medium text-white ${ - isCancellable ? '' : 'hidden' - }`} + className={`rounded-3xl bg-purple-30 px-4 py-2 text-sm font-medium text-white ${isCancellable ? '' : 'hidden' + }`} > Finish @@ -166,7 +226,6 @@ export default function Upload({ ['.docx'], }, }); - let view; if (progress?.type === 'UPLOAD') { view = ; @@ -176,41 +235,85 @@ export default function Upload({ view = ( <>

[JSX markup lost in extraction. The surviving fragments show this hunk removing the original upload form: the "Upload New Documentation" heading, the "Please upload .pdf, .txt, .rst, .docx, .md, .zip limited to 25mb" note, the name input wired to setDocName, the "Choose Files" button, and the "Uploaded Files" list that renders each file.name (or "None" when empty). The same controls are re-added inside an activeTab === 'file' branch; the fragments that follow belong to the new activeTab === 'remote' branch.]
+ + ) + } + { + activeTab === 'remote' && ( + <> + setUrlType(value)} selectedOption={urlType} options={urlOptions} /> + setUrlName(e.target.value)} + > +
+ Name +
+ setUrl(e.target.value)} + > +
+ Link +
+ + ) + }
@@ -232,9 +335,8 @@ export default function Upload({ return (
{view} diff --git a/frontend/tailwind.config.cjs b/frontend/tailwind.config.cjs index 5946c5a3..50af33c8 100644 --- a/frontend/tailwind.config.cjs +++ b/frontend/tailwind.config.cjs @@ -43,7 +43,10 @@ module.exports = { 'dark-charcoal':'#2F3036', 'bright-gray':'#ECECF1', 'outer-space':'#444654', - 'gun-metal':'#2E303E' + 'gun-metal':'#2E303E', + 'sonic-silver':'#747474', + 'soap':'#D8CCF1', + 'independence':'#54546D' }, }, }, From 030c2a740ff5c4dc034ab041c73218162d90eefb Mon Sep 17 00:00:00 2001 From: Pavel Date: Tue, 13 Feb 2024 23:41:36 +0300 Subject: [PATCH 12/17] upload_remote class --- application/api/user/routes.py | 28 +++++++++++++++++++++++++++- application/api/user/tasks.py | 7 ++++++- application/worker.py | 8 ++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/application/api/user/routes.py b/application/api/user/routes.py index 86742572..592a82cd 100644 --- a/application/api/user/routes.py +++ b/application/api/user/routes.py @@ -5,7 +5,7 @@ from pymongo import MongoClient from bson.objectid import ObjectId from werkzeug.utils import secure_filename -from application.api.user.tasks import ingest +from application.api.user.tasks import ingest, ingest_remote from application.core.settings import settings from application.vectorstore.vector_creator import VectorCreator @@ -157,6 +157,32 @@ def upload_file(): return {"status": "ok", "task_id": task_id} else: return {"status": "error"} + +@user.route("/api/remote", methods=["POST"]) +def upload_remote(): + """Upload a remote source to get vectorized and indexed.""" + if "user" not in request.form: + return {"status": "no user"} + user = secure_filename(request.form["user"]) + if "source" not in request.form: + return {"status": "no source"} + source = secure_filename(request.form["source"]) + if "name" not in request.form: + return {"status": "no name"} + job_name = secure_filename(request.form["name"]) + # check if the post request has the file part + if "data" not in request.form: + print("No data") + return {"status": "no data"} + source_data = request.form["data"] + + if source_data: + task = ingest_remote.delay(source_data=source_data, job_name=job_name, user=user, loader=source) + # task id + task_id = task.id + return {"status": "ok", "task_id": task_id} + else: + return {"status": "error"} @user.route("/api/task_status", methods=["GET"]) def task_status(): diff --git a/application/api/user/tasks.py b/application/api/user/tasks.py index a3474939..4602bf85 100644 --- a/application/api/user/tasks.py +++ b/application/api/user/tasks.py @@ -1,7 +1,12 @@ -from application.worker import ingest_worker +from application.worker import ingest_worker, remote_worker from application.celery import celery @celery.task(bind=True) def ingest(self, directory, formats, name_job, filename, user): resp = ingest_worker(self, directory, formats, name_job, filename, user) return resp + +@celery.task(bind=True) +def ingest_remote(self, source_data, job_name, user, loader): + resp = remote_worker(self, source_data, job_name, user, loader) + return resp diff --git a/application/worker.py b/application/worker.py index 5fc28749..50344a26 100644 --- a/application/worker.py +++ b/application/worker.py @@ -123,7 +123,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): 'limited': False } -def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'url'): +def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'): sample = False token_check = True min_tokens = 150 
@@ -135,10 +135,10 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur self.update_state(state='PROGRESS', meta={'current': 1}) - # inputs {"data": [url]} for url type task just urls + # source_data {"data": [url]} for url type task just urls # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, inputs) + remote_loader = RemoteCreator.create_loader(loader, source_data) raw_docs = remote_loader.load_data() raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) @@ -165,7 +165,7 @@ def remote_worker(self, inputs, name_job, user, directory = 'temp', loader = 'ur shutil.rmtree(full_path) return { - 'urls': inputs['data'], + 'urls': source_data['data'], 'name_job': name_job, 'user': user, 'limited': False From 0cb3d12d94685535c9cba3af7436244805e4abe8 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 14 Feb 2024 15:17:56 +0000 Subject: [PATCH 13/17] Refactor loader classes to accept inputs directly --- application/parser/remote/crawler_loader.py | 2 +- application/parser/remote/sitemap_loader.py | 2 +- application/parser/remote/web_loader.py | 2 +- application/worker.py | 14 ++++++-------- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/application/parser/remote/crawler_loader.py b/application/parser/remote/crawler_loader.py index ee037e59..2a63f284 100644 --- a/application/parser/remote/crawler_loader.py +++ b/application/parser/remote/crawler_loader.py @@ -10,7 +10,7 @@ class CrawlerLoader(BaseRemote): self.limit = limit # Set the limit for the number of pages to scrape def load_data(self, inputs): - url = inputs['data'] + url = inputs # Check if the input is a list and if it is, use the first element if isinstance(url, list) and url: url = url[0] diff --git a/application/parser/remote/sitemap_loader.py b/application/parser/remote/sitemap_loader.py index 0748f104..6e9182c4 100644 --- a/application/parser/remote/sitemap_loader.py +++ b/application/parser/remote/sitemap_loader.py @@ -10,7 +10,7 @@ class SitemapLoader(BaseRemote): self.limit = limit # Adding limit to control the number of URLs to process def load_data(self, inputs): - sitemap_url= inputs['data'] + sitemap_url= inputs # Check if the input is a list and if it is, use the first element if isinstance(sitemap_url, list) and sitemap_url: url = sitemap_url[0] diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py index e5cd2e2f..9fc50c1c 100644 --- a/application/parser/remote/web_loader.py +++ b/application/parser/remote/web_loader.py @@ -6,7 +6,7 @@ class WebLoader(BaseRemote): self.loader = WebBaseLoader def load_data(self, inputs): - urls = inputs['data'] + urls = inputs if isinstance(urls, str): urls = [urls] # Convert string to list if a single URL is passed diff --git a/application/worker.py b/application/worker.py index 50344a26..875611bf 100644 --- a/application/worker.py +++ b/application/worker.py @@ -138,19 +138,17 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader # source_data {"data": [url]} for url type task just urls # Use RemoteCreator to load data from URL - remote_loader = RemoteCreator.create_loader(loader, source_data) - raw_docs = remote_loader.load_data() + remote_loader = RemoteCreator.create_loader(loader) + raw_docs = remote_loader.load_data(source_data) - raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) + docs = 
group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check) - docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + #docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] call_openai_api(docs, full_path, self) self.update_state(state='PROGRESS', meta={'current': 100}) + - if sample: - for i in range(min(5, len(raw_docs))): - print(raw_docs[i].text) # Proceed with uploading and cleaning as in the original function file_data = {'name': name_job, 'user': user} @@ -165,7 +163,7 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader shutil.rmtree(full_path) return { - 'urls': source_data['data'], + 'urls': source_data, 'name_job': name_job, 'user': user, 'limited': False From c4c0516820e7244d9b89ba26ba91c19ded343bda Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Feb 2024 14:31:54 +0000 Subject: [PATCH 14/17] add endpoint --- frontend/src/upload/Upload.tsx | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 0e45e8c5..e3ccad06 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -209,6 +209,29 @@ export default function Upload({ xhr.send(formData); }; + const uploadRemote = () => { + console.log("here") + const formData = new FormData(); + formData.append('name', urlName); + formData.append('user', 'local'); + if (urlType !== null) { + formData.append('source', urlType?.value); + } + formData.append('data', url); + const apiHost = import.meta.env.VITE_API_HOST; + const xhr = new XMLHttpRequest(); + xhr.upload.addEventListener('progress', (event) => { + const progress = +((event.loaded / event.total) * 100).toFixed(2); + setProgress({ type: 'UPLOAD', percentage: progress }); + }); + xhr.onload = () => { + const { task_id } = JSON.parse(xhr.responseText); + setProgress({ type: 'TRAINIING', percentage: 0, taskId: task_id }); + }; + xhr.open('POST', `${apiHost + '/api/remote'}`); + xhr.send(formData); + }; + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, multiple: false, @@ -309,12 +332,12 @@ export default function Upload({ }
From 325a8889ab106bda471f08a4572dd70ccfcc3fed Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 27 Feb 2024 11:52:51 +0000 Subject: [PATCH 15/17] update url --- frontend/src/upload/Upload.tsx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index e3ccad06..d0d5bf6a 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -70,9 +70,9 @@ export default function Upload({ const [urlName, setUrlName] = useState('') const [url, setUrl] = useState('') const urlOptions: urlOption[] = [ - { label: 'Github', value: 'github' }, - { label: 'Sitemap', value: 'Sitemap' }, - { label: 'Link', value: 'link' }] + { label: 'Crawler', value: 'crawler' }, + { label: 'Sitemap', value: 'sitemap' }, + { label: 'Link', value: 'url' }] const [urlType, setUrlType] = useState(null) const [activeTab, setActiveTab] = useState('file'); const [files, setfiles] = useState([]); From 54d187a0ade1f2f82d66067b86107114db8eaee8 Mon Sep 17 00:00:00 2001 From: Pavel Date: Wed, 28 Feb 2024 19:52:58 +0300 Subject: [PATCH 16/17] Fixing ingestion metadata grouping --- .gitignore | 1 + application/parser/remote/base.py | 2 +- application/parser/remote/telegram.py | 4 ++-- application/worker.py | 8 ++++---- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 053e5793..1a5f0419 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ application/vectors/ node_modules/ .vscode/settings.json models/ +model/ \ No newline at end of file diff --git a/application/parser/remote/base.py b/application/parser/remote/base.py index 75ae34d5..91313f22 100644 --- a/application/parser/remote/base.py +++ b/application/parser/remote/base.py @@ -1,6 +1,6 @@ """Base reader class.""" from abc import abstractmethod -from typing import Any, List, Iterator +from typing import Any, List from langchain.docstore.document import Document as LCDocument from application.parser.schema.base import Document diff --git a/application/parser/remote/telegram.py b/application/parser/remote/telegram.py index 895d5cb3..0e691be4 100644 --- a/application/parser/remote/telegram.py +++ b/application/parser/remote/telegram.py @@ -1,4 +1,4 @@ -from langchain.document_loader import TelegramChatApiLoader, TelegramChatFileLoader +from langchain.document_loader import TelegramChatApiLoader from application.parser.remote.base import BaseRemote class TelegramChatApiRemote(BaseRemote): @@ -8,4 +8,4 @@ class TelegramChatApiRemote(BaseRemote): def parse_file(self, *args, **load_kwargs): - return text \ No newline at end of file + return \ No newline at end of file diff --git a/application/worker.py b/application/worker.py index 875611bf..21bb319f 100644 --- a/application/worker.py +++ b/application/worker.py @@ -124,7 +124,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): } def remote_worker(self, source_data, name_job, user, directory = 'temp', loader = 'url'): - sample = False + # sample = False token_check = True min_tokens = 150 max_tokens = 1250 @@ -155,10 +155,10 @@ def remote_worker(self, source_data, name_job, user, directory = 'temp', loader if settings.VECTOR_STORE == "faiss": files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), 'file_pkl': open(full_path + '/index.pkl', 'rb')} - response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) - response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) + 
requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + requests.get(urljoin(settings.API_URL, "/api/delete_old?path=" + full_path)) else: - response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) + requests.post(urljoin(settings.API_URL, "/api/upload_index"), data=file_data) shutil.rmtree(full_path) From f4288f0bd4abc949b2a2c5ede16f1d770ba33216 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 1 Mar 2024 14:41:03 +0000 Subject: [PATCH 17/17] remove sitemap --- frontend/src/upload/Upload.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index d0d5bf6a..c9c5cde4 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -71,7 +71,7 @@ export default function Upload({ const [url, setUrl] = useState('') const urlOptions: urlOption[] = [ { label: 'Crawler', value: 'crawler' }, - { label: 'Sitemap', value: 'sitemap' }, + // { label: 'Sitemap', value: 'sitemap' }, { label: 'Link', value: 'url' }] const [urlType, setUrlType] = useState(null) const [activeTab, setActiveTab] = useState('file');
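A quick way to exercise the remote ingestion path these patches add is to call the new /api/remote endpoint directly. The sketch below is not part of the patch series: the host/port, job name and example URL are assumptions, while the form fields (user, name, source, data), the loader keys and the {"status": "ok", "task_id": ...} response shape come from the upload_remote route and RemoteCreator code above.

# Hedged usage sketch (assumes the Flask API and a Celery worker are running;
# adjust API_HOST to your deployment: the value below is an assumption, not taken from the patch).
import requests

API_HOST = "http://localhost:7091"  # assumed local deployment

payload = {
    "user": "local",                # same default user the frontend's uploadRemote() sends
    "name": "docs-site",            # job name; remote_worker also uses it as the index folder name
    "source": "crawler",            # one of the RemoteCreator keys: "url", "sitemap", "crawler"
    "data": "https://example.com",  # placeholder URL to ingest
}

# upload_remote() reads these fields from request.form, so send them as form data.
resp = requests.post(f"{API_HOST}/api/remote", data=payload)
resp.raise_for_status()
print(resp.json())  # expected shape: {"status": "ok", "task_id": "<celery task id>"}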