From d0b472ad3893b41d190f6937676ef99c6e985953 Mon Sep 17 00:00:00 2001
From: Manan <manan19jain@gmail.com>
Date: Sun, 19 Feb 2023 01:53:16 +0530
Subject: [PATCH 1/4] Implemented html_parser: cleaning & chunk creation

---
 scripts/parser/file/bulk.py        |  2 +
 scripts/parser/file/html_parser.py | 73 ++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 scripts/parser/file/html_parser.py

diff --git a/scripts/parser/file/bulk.py b/scripts/parser/file/bulk.py
index 7808186..4fdea6f 100644
--- a/scripts/parser/file/bulk.py
+++ b/scripts/parser/file/bulk.py
@@ -7,6 +7,7 @@ from parser.file.base import BaseReader
 from parser.file.base_parser import BaseParser
 from parser.file.docs_parser import DocxParser, PDFParser
 from parser.file.epub_parser import EpubParser
+from parser.file.html_parser import HTMLParser
 from parser.file.markdown_parser import MarkdownParser
 from parser.file.rst_parser import RstParser
 from parser.file.tabular_parser import PandasCSVParser
@@ -19,6 +20,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
     ".epub": EpubParser(),
     ".md": MarkdownParser(),
     ".rst": RstParser(),
+    ".html": HTMLParser(),
 }
 
 
diff --git a/scripts/parser/file/html_parser.py b/scripts/parser/file/html_parser.py
new file mode 100644
index 0000000..c941dd7
--- /dev/null
+++ b/scripts/parser/file/html_parser.py
@@ -0,0 +1,73 @@
+"""HTML parser.
+
+Contains parser for html files.
+
+"""
+import re
+from pathlib import Path
+from typing import Dict, Union
+
+from parser.file.base_parser import BaseParser
+
+class HTMLParser(BaseParser):
+    """HTML parser."""
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> str:
+        """Parse file."""
+        try:
+            import unstructured
+        except ImportError:
+            raise ValueError("unstructured package is required to parse HTML files.")
+        from unstructured.partition.html import partition_html
+        from unstructured.staging.base import convert_to_isd
+        from unstructured.cleaners.core import clean
+
+        with open(file, "r", encoding="utf-8") as fp:
+            elements = partition_html(file=fp)
+            isd = convert_to_isd(elements)
+
+            # Removing non ascii charactwers from isd_el['text']
+            for isd_el in isd:
+                isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
+
+            # Removing all the \n characters from isd_el['text'] using regex and replace with single space
+            # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
+            for isd_el in isd:
+                isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+                isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+
+            # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
+            for isd_el in isd:
+                clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
+
+            # Creating a list of all the indexes of isd_el['type'] = 'Title'
+            title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
+
+            # Creating 'Chunks' - List of lists of strings 
+            # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
+            # Each Chunk can be thought of as an individual set of data, which can be sent to the model
+
+            Chunks = list(list())
+
+            for i,isd_el in enumerate(isd):
+                if i in title_indexes:
+                    Chunks.append([])
+                Chunks[-1].append(isd_el['text'])
+
+            print(Chunks)
+
+            # writing the chunks to a file
+            # with open('chunks.txt', 'w') as f:
+                # for chunk in Chunks:
+                    # f.write("%s \n" % chunk)
+
+
+        # # convert to isd ;Format : {'text': 'Navigation', 'type': 'Title'}         
+        # with open(file, "r", encoding="utf-8") as fp:
+        #     elements = partition_html(file=fp)
+        #     isd = convert_to_isd(elements)
+        #     print(isd)
\ No newline at end of file

From 4f5beaaa9e29cf14153068c02539cf9bba40a78c Mon Sep 17 00:00:00 2001
From: Manan <manan19jain@gmail.com>
Date: Sun, 19 Feb 2023 01:54:00 +0530
Subject: [PATCH 2/4] require package unstructured for html_parser

---
 application/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/application/requirements.txt b/application/requirements.txt
index 9e8f73b..8531ab4 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -62,6 +62,7 @@ tqdm==4.64.1
 transformers==4.26.0
 typing-inspect==0.8.0
 typing_extensions==4.4.0
+unstructured==0.4.8
 urllib3==1.26.14
 Werkzeug==2.2.2
 XlsxWriter==3.0.8

From 16eb503e3634dbac6422209795803972fa9e635d Mon Sep 17 00:00:00 2001
From: Manan <manan19jain@gmail.com>
Date: Tue, 21 Feb 2023 23:06:00 +0530
Subject: [PATCH 3/4] Added HTML Support. read, clean-up, filter return

---
 scripts/ingest.py                  |  2 +-
 scripts/parser/file/html_parser.py | 99 ++++++++++++++++--------------
 2 files changed, 55 insertions(+), 46 deletions(-)

diff --git a/scripts/ingest.py b/scripts/ingest.py
index 2194a7c..f41b86e 100644
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -37,7 +37,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
                                                    help="Maximum number of files to read."),
            formats: Optional[List[str]] = typer.Option([".rst", ".md"],
                                                    help="""List of required extensions (list with .)
-                                                        Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""),
+                                                        Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html"""),
            exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")):
 
     """
diff --git a/scripts/parser/file/html_parser.py b/scripts/parser/file/html_parser.py
index c941dd7..12c01ae 100644
--- a/scripts/parser/file/html_parser.py
+++ b/scripts/parser/file/html_parser.py
@@ -16,8 +16,12 @@ class HTMLParser(BaseParser):
         """Init parser."""
         return {}
 
-    def parse_file(self, file: Path, errors: str = "ignore") -> str:
-        """Parse file."""
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
+        """Parse file.
+
+            Returns:
+            Union[str, List[str]]: a string or a List of strings.
+        """
         try:
             import unstructured
         except ImportError:
@@ -26,48 +30,53 @@ class HTMLParser(BaseParser):
         from unstructured.staging.base import convert_to_isd
         from unstructured.cleaners.core import clean
 
+        # Using the unstructured library to convert the html to isd format
+        # isd sample : isd = [
+                            #   {"text": "My Title", "type": "Title"},
+                            #   {"text": "My Narrative", "type": "NarrativeText"}
+                            # ]
         with open(file, "r", encoding="utf-8") as fp:
             elements = partition_html(file=fp)
-            isd = convert_to_isd(elements)
-
-            # Removing non ascii charactwers from isd_el['text']
-            for isd_el in isd:
-                isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
-
-            # Removing all the \n characters from isd_el['text'] using regex and replace with single space
-            # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
-            for isd_el in isd:
-                isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
-                isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
-
-            # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
-            for isd_el in isd:
-                clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
-
-            # Creating a list of all the indexes of isd_el['type'] = 'Title'
-            title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
-
-            # Creating 'Chunks' - List of lists of strings 
-            # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
-            # Each Chunk can be thought of as an individual set of data, which can be sent to the model
-
-            Chunks = list(list())
-
-            for i,isd_el in enumerate(isd):
-                if i in title_indexes:
-                    Chunks.append([])
-                Chunks[-1].append(isd_el['text'])
-
-            print(Chunks)
-
-            # writing the chunks to a file
-            # with open('chunks.txt', 'w') as f:
-                # for chunk in Chunks:
-                    # f.write("%s \n" % chunk)
-
-
-        # # convert to isd ;Format : {'text': 'Navigation', 'type': 'Title'}         
-        # with open(file, "r", encoding="utf-8") as fp:
-        #     elements = partition_html(file=fp)
-        #     isd = convert_to_isd(elements)
-        #     print(isd)
\ No newline at end of file
+            isd = convert_to_isd(elements)  
+
+        # Removing non ascii characters from isd_el['text']
+        for isd_el in isd:
+            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
+
+        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
+        # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
+        for isd_el in isd:
+            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+            isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+
+        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
+        for isd_el in isd:
+            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
+
+        # Creating a list of all the indexes of isd_el['type'] = 'Title'
+        title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
+
+        # Creating 'Chunks' - List of lists of strings 
+        # each list starting with isd_el['type'] = 'Title' and all the data till the next 'Title'
+        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
+        # Where Each Title is grouped together with the data under it
+
+        Chunks = list(list())
+        final_chunks = list(list())
+
+        for i,isd_el in enumerate(isd):
+            if i in title_indexes:
+                Chunks.append([])
+            Chunks[-1].append(isd_el['text'])
+
+        # Removing all the chunks with sum of length of all the strings in the chunk < 25 #TODO: This value can be a user defined variable
+        for chunk in Chunks:
+            # sum of length of all the strings in the chunk
+            sum = 0
+            sum += len(str(chunk))
+            if sum < 25:
+                Chunks.remove(chunk)
+            else :         
+                # appending all the approved chunks to final_chunks as a single string       
+                final_chunks.append(" ".join([str(item) for item in chunk]))
+        return final_chunks

From 524e0f6f01a72824ca212a6d552460065cb89468 Mon Sep 17 00:00:00 2001
From: Manan <manan19jain@gmail.com>
Date: Wed, 22 Feb 2023 20:20:54 +0530
Subject: [PATCH 4/4] fix | Chunk creation error when title not the first
 element in HTML

---
 scripts/parser/file/html_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/parser/file/html_parser.py b/scripts/parser/file/html_parser.py
index 12c01ae..53d7492 100644
--- a/scripts/parser/file/html_parser.py
+++ b/scripts/parser/file/html_parser.py
@@ -61,7 +61,7 @@ class HTMLParser(BaseParser):
         # Each Chunk can be thought of as an individual set of data, which can be sent to the model
         # Where Each Title is grouped together with the data under it
 
-        Chunks = list(list())
+        Chunks = [[]]
         final_chunks = list(list())
 
         for i,isd_el in enumerate(isd):