mirror of https://github.com/arc53/DocsGPT
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
73 lines
2.8 KiB
Python
73 lines
2.8 KiB
Python
2 years ago
|
"""HTML parser.
|
||
|
|
||
|
Contains parser for html files.
|
||
|
|
||
|
"""
|
||
|
import re
|
||
|
from pathlib import Path
|
||
|
from typing import Dict, Union
|
||
|
|
||
|
from parser.file.base_parser import BaseParser
|
||
|
|
||
|
class HTMLParser(BaseParser):
|
||
|
"""HTML parser."""
|
||
|
|
||
|
def _init_parser(self) -> Dict:
|
||
|
"""Init parser."""
|
||
|
return {}
|
||
|
|
||
|
def parse_file(self, file: Path, errors: str = "ignore") -> str:
|
||
|
"""Parse file."""
|
||
|
try:
|
||
|
import unstructured
|
||
|
except ImportError:
|
||
|
raise ValueError("unstructured package is required to parse HTML files.")
|
||
|
from unstructured.partition.html import partition_html
|
||
|
from unstructured.staging.base import convert_to_isd
|
||
|
from unstructured.cleaners.core import clean
|
||
|
|
||
|
with open(file, "r", encoding="utf-8") as fp:
|
||
|
elements = partition_html(file=fp)
|
||
|
isd = convert_to_isd(elements)
|
||
|
|
||
|
# Removing non ascii charactwers from isd_el['text']
|
||
|
for isd_el in isd:
|
||
|
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
|
||
|
|
||
|
# Removing all the \n characters from isd_el['text'] using regex and replace with single space
|
||
|
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
|
||
|
for isd_el in isd:
|
||
|
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||
|
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
|
||
|
|
||
|
# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
|
||
|
for isd_el in isd:
|
||
|
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
|
||
|
|
||
|
# Creating a list of all the indexes of isd_el['type'] = 'Title'
|
||
|
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
|
||
|
|
||
|
# Creating 'Chunks' - List of lists of strings
|
||
|
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
|
||
|
# Each Chunk can be thought of as an individual set of data, which can be sent to the model
|
||
|
|
||
|
Chunks = list(list())
|
||
|
|
||
|
for i,isd_el in enumerate(isd):
|
||
|
if i in title_indexes:
|
||
|
Chunks.append([])
|
||
|
Chunks[-1].append(isd_el['text'])
|
||
|
|
||
|
print(Chunks)
|
||
|
|
||
|
# writing the chunks to a file
|
||
|
# with open('chunks.txt', 'w') as f:
|
||
|
# for chunk in Chunks:
|
||
|
# f.write("%s \n" % chunk)
|
||
|
|
||
|
|
||
|
# # convert to isd ;Format : {'text': 'Navigation', 'type': 'Title'}
|
||
|
# with open(file, "r", encoding="utf-8") as fp:
|
||
|
# elements = partition_html(file=fp)
|
||
|
# isd = convert_to_isd(elements)
|
||
|
# print(isd)
|