DocsGPT/scripts/parser/file/epub_parser.py

"""Epub parser.

Contains parsers for epub files.
"""

from pathlib import Path
from typing import Dict

from parser.file.base_parser import BaseParser


class EpubParser(BaseParser):
    """Epub Parser."""

    def _init_parser(self) -> Dict:
        """Init parser."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Parse file."""
        try:
            import ebooklib
            from ebooklib import epub
        except ImportError:
            raise ValueError("`EbookLib` is required to read Epub files.")
        try:
            import html2text
        except ImportError:
            raise ValueError("`html2text` is required to parse Epub files.")

        text_list = []
        book = epub.read_epub(file, options={"ignore_ncx": True})

        # Iterate through all chapters.
        for item in book.get_items():
            # Chapters are typically located in epub documents items.
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                text_list.append(
                    html2text.html2text(item.get_content().decode("utf-8"))
                )

        text = "\n".join(text_list)
        return text
Bulk ingest Added a method based on indexGPT folder ingester. Additional rst reader included. 2023-02-10 15:44:42 +00:00			`"""Epub parser.`

			`Contains parsers for epub files.`
			`"""`

			`from pathlib import Path`
			`from typing import Dict`

			`from parser.file.base_parser import BaseParser`


			`class EpubParser(BaseParser):`
			`"""Epub Parser."""`

			`def _init_parser(self) -> Dict:`
			`"""Init parser."""`
			`return {}`

			`def parse_file(self, file: Path, errors: str = "ignore") -> str:`
			`"""Parse file."""`
			`try:`
			`import ebooklib`
			`from ebooklib import epub`
			`except ImportError:`
			raise ValueError("`EbookLib` is required to read Epub files.")
			`try:`
			`import html2text`
			`except ImportError:`
			raise ValueError("`html2text` is required to parse Epub files.")

			`text_list = []`
			`book = epub.read_epub(file, options={"ignore_ncx": True})`

			`# Iterate through all chapters.`
			`for item in book.get_items():`
			`# Chapters are typically located in epub documents items.`
			`if item.get_type() == ebooklib.ITEM_DOCUMENT:`
			`text_list.append(`
			`html2text.html2text(item.get_content().decode("utf-8"))`
			`)`

			`text = "\n".join(text_list)`
			`return text`