From d0b472ad3893b41d190f6937676ef99c6e985953 Mon Sep 17 00:00:00 2001 From: Manan Date: Sun, 19 Feb 2023 01:53:16 +0530 Subject: [PATCH 1/4] Implemented html_parser: cleaning & chunk creation --- scripts/parser/file/bulk.py | 2 + scripts/parser/file/html_parser.py | 73 ++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 scripts/parser/file/html_parser.py diff --git a/scripts/parser/file/bulk.py b/scripts/parser/file/bulk.py index 7808186..4fdea6f 100644 --- a/scripts/parser/file/bulk.py +++ b/scripts/parser/file/bulk.py @@ -7,6 +7,7 @@ from parser.file.base import BaseReader from parser.file.base_parser import BaseParser from parser.file.docs_parser import DocxParser, PDFParser from parser.file.epub_parser import EpubParser +from parser.file.html_parser import HTMLParser from parser.file.markdown_parser import MarkdownParser from parser.file.rst_parser import RstParser from parser.file.tabular_parser import PandasCSVParser @@ -19,6 +20,7 @@ DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".epub": EpubParser(), ".md": MarkdownParser(), ".rst": RstParser(), + ".html": HTMLParser(), } diff --git a/scripts/parser/file/html_parser.py b/scripts/parser/file/html_parser.py new file mode 100644 index 0000000..c941dd7 --- /dev/null +++ b/scripts/parser/file/html_parser.py @@ -0,0 +1,73 @@ +"""HTML parser. + +Contains parser for html files. 
+ +""" +import re +from pathlib import Path +from typing import Dict, Union + +from parser.file.base_parser import BaseParser + +class HTMLParser(BaseParser): + """HTML parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import unstructured + except ImportError: + raise ValueError("unstructured package is required to parse HTML files.") + from unstructured.partition.html import partition_html + from unstructured.staging.base import convert_to_isd + from unstructured.cleaners.core import clean + + with open(file, "r", encoding="utf-8") as fp: + elements = partition_html(file=fp) + isd = convert_to_isd(elements) + + # Removing non ascii charactwers from isd_el['text'] + for isd_el in isd: + isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() + + # Removing all the \n characters from isd_el['text'] using regex and replace with single space + # Removing all the extra spaces from isd_el['text'] using regex and replace with single space + for isd_el in isd: + isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL) + isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL) + + # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation + for isd_el in isd: + clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True ) + + # Creating a list of all the indexes of isd_el['type'] = 'Title' + title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title'] + + # Creating 'Chunks' - List of lists of strings + # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title' + # Each Chunk can be thought of as an individual set of data, which can be sent to the model + + Chunks = list(list()) + + for i,isd_el in enumerate(isd): + if i in title_indexes: + Chunks.append([]) + 
Chunks[-1].append(isd_el['text']) + + print(Chunks) + + # writing the chunks to a file + # with open('chunks.txt', 'w') as f: + # for chunk in Chunks: + # f.write("%s \n" % chunk) + + + # # convert to isd ;Format : {'text': 'Navigation', 'type': 'Title'} + # with open(file, "r", encoding="utf-8") as fp: + # elements = partition_html(file=fp) + # isd = convert_to_isd(elements) + # print(isd) \ No newline at end of file From 4f5beaaa9e29cf14153068c02539cf9bba40a78c Mon Sep 17 00:00:00 2001 From: Manan Date: Sun, 19 Feb 2023 01:54:00 +0530 Subject: [PATCH 2/4] require package unstructured for html_parser --- application/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/application/requirements.txt b/application/requirements.txt index 9e8f73b..8531ab4 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -62,6 +62,7 @@ tqdm==4.64.1 transformers==4.26.0 typing-inspect==0.8.0 typing_extensions==4.4.0 +unstructured==0.4.8 urllib3==1.26.14 Werkzeug==2.2.2 XlsxWriter==3.0.8 From 16eb503e3634dbac6422209795803972fa9e635d Mon Sep 17 00:00:00 2001 From: Manan Date: Tue, 21 Feb 2023 23:06:00 +0530 Subject: [PATCH 3/4] Added HTML Support. read, clean-up, filter return --- scripts/ingest.py | 2 +- scripts/parser/file/html_parser.py | 99 ++++++++++++++++-------------- 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/scripts/ingest.py b/scripts/ingest.py index 2194a7c..f41b86e 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -37,7 +37,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, help="Maximum number of files to read."), formats: Optional[List[str]] = typer.Option([".rst", ".md"], help="""List of required extensions (list with .) 
- Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""), + Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html"""), exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")): """ diff --git a/scripts/parser/file/html_parser.py b/scripts/parser/file/html_parser.py index c941dd7..12c01ae 100644 --- a/scripts/parser/file/html_parser.py +++ b/scripts/parser/file/html_parser.py @@ -16,8 +16,12 @@ class HTMLParser(BaseParser): """Init parser.""" return {} - def parse_file(self, file: Path, errors: str = "ignore") -> str: - """Parse file.""" + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]: + """Parse file. + + Returns: + Union[str, List[str]]: a string or a List of strings. + """ try: import unstructured except ImportError: @@ -26,48 +30,53 @@ class HTMLParser(BaseParser): from unstructured.staging.base import convert_to_isd from unstructured.cleaners.core import clean + # Using the unstructured library to convert the html to isd format + # isd sample : isd = [ + # {"text": "My Title", "type": "Title"}, + # {"text": "My Narrative", "type": "NarrativeText"} + # ] with open(file, "r", encoding="utf-8") as fp: elements = partition_html(file=fp) - isd = convert_to_isd(elements) - - # Removing non ascii charactwers from isd_el['text'] - for isd_el in isd: - isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() - - # Removing all the \n characters from isd_el['text'] using regex and replace with single space - # Removing all the extra spaces from isd_el['text'] using regex and replace with single space - for isd_el in isd: - isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL) - isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL) - - # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation - for isd_el in isd: - clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, 
trailing_punctuation=True ) - - # Creating a list of all the indexes of isd_el['type'] = 'Title' - title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title'] - - # Creating 'Chunks' - List of lists of strings - # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title' - # Each Chunk can be thought of as an individual set of data, which can be sent to the model - - Chunks = list(list()) - - for i,isd_el in enumerate(isd): - if i in title_indexes: - Chunks.append([]) - Chunks[-1].append(isd_el['text']) - - print(Chunks) - - # writing the chunks to a file - # with open('chunks.txt', 'w') as f: - # for chunk in Chunks: - # f.write("%s \n" % chunk) - - - # # convert to isd ;Format : {'text': 'Navigation', 'type': 'Title'} - # with open(file, "r", encoding="utf-8") as fp: - # elements = partition_html(file=fp) - # isd = convert_to_isd(elements) - # print(isd) \ No newline at end of file + isd = convert_to_isd(elements) + + # Removing non ascii characters from isd_el['text'] + for isd_el in isd: + isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() + + # Removing all the \n characters from isd_el['text'] using regex and replace with single space + # Removing all the extra spaces from isd_el['text'] using regex and replace with single space + for isd_el in isd: + isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL) + isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL) + + # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation (clean() returns a new string, so the result is stored back) + for isd_el in isd: + isd_el['text'] = clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True ) + + # Creating a list of all the indexes of isd_el['type'] = 'Title' + title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title'] + + # Creating 'Chunks' - List of lists of strings + # each list starting with isd_el['type'] = 'Title' and all the data 
till the next 'Title' + # Each Chunk can be thought of as an individual set of data, which can be sent to the model + # Where Each Title is grouped together with the data under it + + Chunks = list(list()) + final_chunks = list(list()) + + for i,isd_el in enumerate(isd): + if i in title_indexes: + Chunks.append([]) + Chunks[-1].append(isd_el['text']) + + # Skipping all the chunks whose str() form is shorter than 25 characters #TODO: This value can be a user defined variable + for chunk in Chunks: + # length of str(chunk), i.e. of the list repr (brackets, quotes and commas included) + sum = 0 + sum += len(str(chunk)) + if sum < 25: + continue # do not remove from Chunks while iterating over it: that makes the loop skip the next chunk + else : + # appending all the approved chunks to final_chunks as a single string + final_chunks.append(" ".join([str(item) for item in chunk])) + return final_chunks From 524e0f6f01a72824ca212a6d552460065cb89468 Mon Sep 17 00:00:00 2001 From: Manan Date: Wed, 22 Feb 2023 20:20:54 +0530 Subject: [PATCH 4/4] fix | Chunk creation error when title not the first element in HTML --- scripts/parser/file/html_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/parser/file/html_parser.py b/scripts/parser/file/html_parser.py index 12c01ae..53d7492 100644 --- a/scripts/parser/file/html_parser.py +++ b/scripts/parser/file/html_parser.py @@ -61,7 +61,7 @@ class HTMLParser(BaseParser): # Each Chunk can be thought of as an individual set of data, which can be sent to the model # Where Each Title is grouped together with the data under it - Chunks = list(list()) + Chunks = [[]] final_chunks = list(list()) for i,isd_el in enumerate(isd):