DocsGPT/application/parser/file/html_parser.py

"""HTML parser.

Contains the parser for HTML files.

"""
import re
from pathlib import Path
from typing import Dict, Union

from application.parser.file.base_parser import BaseParser


class HTMLParser(BaseParser):
    """HTML parser.

    Splits an HTML document into text chunks, where each chunk is a title
    followed by all the content up to the next title.
    """

    # Chunks whose combined text is shorter than this many characters are dropped.
    # Declared on the class so a subclass can override it without touching parse_file.
    MIN_CHUNK_LENGTH = 25

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            Dict: an empty dict; this parser keeps no pre-loaded state.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
        """Parse file.

        The file is partitioned with the ``unstructured`` library, every
        element's text is normalized, and the elements are grouped into
        chunks that each start at a ``Title`` element.

        Args:
            file (Path): path of the HTML file to read (opened as UTF-8).
            errors (str): decoding error handler handed to ``open``.

        Returns:
            Union[str, List[str]]: a list with one string per kept chunk; the
            texts of a chunk are joined with single spaces. Chunks whose total
            text length is below ``MIN_CHUNK_LENGTH`` are left out.

        Raises:
            ValueError: if the ``unstructured`` package cannot be imported.
        """
        try:
            from unstructured.partition.html import partition_html
            from unstructured.staging.base import convert_to_isd
            from unstructured.cleaners.core import clean
        except ImportError as err:
            # Same exception type as before; the original ImportError is kept as the cause.
            raise ValueError("unstructured package is required to parse HTML files.") from err

        # Using the unstructured library to convert the html to isd format
        # isd sample : isd = [
        #   {"text": "My Title", "type": "Title"},
        #   {"text": "My Narrative", "type": "NarrativeText"}
        # ]
        # `errors` is forwarded so the caller-selected handler governs undecodable bytes.
        with open(file, "r", encoding="utf-8", errors=errors) as fp:
            elements = partition_html(file=fp)
            isd = convert_to_isd(elements)

        # Chunks: list of lists of strings. A new chunk is opened at every 'Title'
        # element, so each title is grouped with the data under it. The leading
        # empty chunk collects anything that appears before the first title.
        chunks = [[]]
        for isd_el in isd:
            # Drop non-ascii characters.
            text = isd_el['text'].encode("ascii", "ignore").decode()
            # Newlines become spaces, then runs of whitespace collapse to one space.
            text = re.sub(r"\s{2,}", " ", text.replace("\n", " "))
            # More cleaning: extra whitespace, dashes, bullets, trailing punctuation.
            # clean() returns the cleaned string, so the result must be kept.
            text = clean(
                text,
                extra_whitespace=True,
                dashes=True,
                bullets=True,
                trailing_punctuation=True,
            )

            if isd_el['type'] == 'Title':
                chunks.append([])
            chunks[-1].append(text)

        # Build a new list instead of removing from `chunks` while iterating it,
        # which would skip the chunk following every removed one.
        return [
            " ".join(chunk)
            for chunk in chunks
            if sum(len(text) for text in chunk) >= self.MIN_CHUNK_LENGTH
        ]
uploads backend first 2023-03-13 14:20:03 +00:00			`"""HTML parser.`

			`Contains parser for html files.`

			`"""`
			`import re`
			`from pathlib import Path`
			`from typing import Dict, Union`

fix packaging and imports and introduce tests with pytest. still issues with celery worker. 2023-08-13 17:25:55 +00:00			`from application.parser.file.base_parser import BaseParser`
uploads backend first 2023-03-13 14:20:03 +00:00
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00
uploads backend first 2023-03-13 14:20:03 +00:00			`class HTMLParser(BaseParser):`
			`"""HTML parser."""`

			`def _init_parser(self) -> Dict:`
			`"""Init parser."""`
			`return {}`

			`def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:`
			`"""Parse file.`

			`Returns:`
			`Union[str, List[str]]: a string or a List of strings.`
			`"""`
			`try:`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`from unstructured.partition.html import partition_html`
			`from unstructured.staging.base import convert_to_isd`
			`from unstructured.cleaners.core import clean`
uploads backend first 2023-03-13 14:20:03 +00:00			`except ImportError:`
			`raise ValueError("unstructured package is required to parse HTML files.")`

			`# Using the unstructured library to convert the html to isd format`
			`# isd sample : isd = [`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`# {"text": "My Title", "type": "Title"},`
			`# {"text": "My Narrative", "type": "NarrativeText"}`
			`# ]`
uploads backend first 2023-03-13 14:20:03 +00:00			`with open(file, "r", encoding="utf-8") as fp:`
			`elements = partition_html(file=fp)`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`isd = convert_to_isd(elements)`
uploads backend first 2023-03-13 14:20:03 +00:00
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`# Removing non ascii charactwers from isd_el['text']`
uploads backend first 2023-03-13 14:20:03 +00:00			`for isd_el in isd:`
			`isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()`

			`# Removing all the \n characters from isd_el['text'] using regex and replace with single space`
			`# Removing all the extra spaces from isd_el['text'] using regex and replace with single space`
			`for isd_el in isd:`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE \| re.DOTALL)`
			`isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE \| re.DOTALL)`
uploads backend first 2023-03-13 14:20:03 +00:00
			`# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation`
			`for isd_el in isd:`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)`
uploads backend first 2023-03-13 14:20:03 +00:00
			`# Creating a list of all the indexes of isd_el['type'] = 'Title'`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']`
uploads backend first 2023-03-13 14:20:03 +00:00
			`# Creating 'Chunks' - List of lists of strings`
			`# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'`
			`# Each Chunk can be thought of as an individual set of data, which can be sent to the model`
			`# Where Each Title is grouped together with the data under it`

			`Chunks = [[]]`
			`final_chunks = list(list())`

Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`for i, isd_el in enumerate(isd):`
uploads backend first 2023-03-13 14:20:03 +00:00			`if i in title_indexes:`
			`Chunks.append([])`
			`Chunks[-1].append(isd_el['text'])`

Fix spelling 2023-10-01 15:25:23 +00:00			`# Removing all the chunks with sum of length of all the strings in the chunk < 25`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`# TODO: This value can be an user defined variable`
uploads backend first 2023-03-13 14:20:03 +00:00			`for chunk in Chunks:`
Fix spelling 2023-10-01 15:25:23 +00:00			`# sum of length of all the strings in the chunk`
uploads backend first 2023-03-13 14:20:03 +00:00			`sum = 0`
			`sum += len(str(chunk))`
			`if sum < 25:`
			`Chunks.remove(chunk)`
Linting * validate python formatting on every build with Ruff * fix lint warnings 2023-05-13 08:36:17 +00:00			`else:`
uploads backend first 2023-03-13 14:20:03 +00:00			`# appending all the approved chunks to final_chunks as a single string`
			`final_chunks.append(" ".join([str(item) for item in chunk]))`
			`return final_chunks`