langchain/libs/community/langchain_community/document_transformers/markdownify.py

import re
from typing import Any, List, Optional, Sequence, Union

from langchain_core.documents import BaseDocumentTransformer, Document


class MarkdownifyTransformer(BaseDocumentTransformer):
    """Convert HTML documents to Markdown using the ``markdownify`` library.

    The options below control how links, images, other tags and headings are
    rendered; they are forwarded to ``markdownify.markdownify`` unchanged.

    Arguments:
        strip: A tag name or a list of tag names to strip. This option can't
            be used together with the ``convert`` option.
        convert: A tag name or a list of tag names to convert. This option
            can't be used together with the ``strip`` option.
        autolinks: Whether the "automatic link" style should be used when an
            ``<a>`` tag's contents match its href. Defaults to True.
        heading_style: Defines how headings should be converted. Accepted
            values are ATX, ATX_CLOSED, SETEXT, and UNDERLINED (which is an
            alias for SETEXT). Defaults to ATX.
        **kwargs: Additional options to pass to markdownify.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import (
                MarkdownifyTransformer,
            )

            markdownify = MarkdownifyTransformer()
            docs_transform = markdownify.transform_documents(docs)

    More configuration options can be found at the markdownify GitHub page:
    https://github.com/matthewwithanm/python-markdownify
    """

    def __init__(
        self,
        strip: Optional[Union[str, List[str]]] = None,
        convert: Optional[Union[str, List[str]]] = None,
        autolinks: bool = True,
        heading_style: str = "ATX",
        **kwargs: Any,
    ) -> None:
        """Store the conversion options; no conversion happens here.

        A bare string given for ``strip`` or ``convert`` is wrapped into a
        one-element list; any other value (including ``None``) is stored as is.
        """
        self.strip = [strip] if isinstance(strip, str) else strip
        self.convert = [convert] if isinstance(convert, str) else convert
        self.autolinks = autolinks
        self.heading_style = heading_style
        # Extra keyword options, forwarded verbatim to markdownify on each call.
        self.additional_options = kwargs

    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Convert the HTML ``page_content`` of each document to Markdown.

        Args:
            documents: Documents whose ``page_content`` holds HTML.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A new list with one document per input, in the same order. Each
            carries the cleaned Markdown text and a shallow copy of the input
            document's metadata.

        Raises:
            ImportError: If the ``markdownify`` package is not installed.
        """
        # Imported lazily so the dependency is only required when used.
        try:
            from markdownify import markdownify
        except ImportError as e:
            # Implicit string concatenation keeps the message on one line; a
            # triple-quoted literal would embed a newline and the source
            # indentation into the text shown to the user.
            raise ImportError(
                "markdownify package not found, please "
                "install it with `pip install markdownify`"
            ) from e

        # Matches a newline, any run of whitespace, and another newline, i.e.
        # one or more blank (or whitespace-only) lines. Compiled once per call.
        blank_lines = re.compile(r"\n\s*\n")

        converted_documents = []
        for doc in documents:
            markdown_content = (
                markdownify(
                    html=doc.page_content,
                    strip=self.strip,
                    convert=self.convert,
                    autolinks=self.autolinks,
                    heading_style=self.heading_style,
                    **self.additional_options,
                )
                # Replace non-breaking spaces with regular spaces.
                .replace("\xa0", " ")
                .strip()
            )

            # Collapse every run of blank lines into exactly one blank line.
            cleaned_markdown = blank_lines.sub("\n\n", markdown_content)

            converted_documents.append(
                Document(
                    page_content=cleaned_markdown,
                    # Copy so that mutating the output's metadata does not
                    # silently change the input document's metadata.
                    metadata=dict(doc.metadata),
                )
            )

        return converted_documents

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronous transformation is not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "MarkdownifyTransformer does not support asynchronous "
            "transformation; use transform_documents instead."
        )
community: Add MarkdownifyTransformer to langchain_community.document_transformers (#21247) - Added new document_transformer: MarkdownifyTransformer, which uses the `markdownify` package with customizable options to convert HTML to Markdown. It's similar to Html2TextTransformer, but has more flexible options, and I've also noticed that MarkdownifyTransformer sometimes performs better than the html2text one, which is why I use markdownify in my project. - Added docs and tests - Usage: ```python from langchain_community.document_transformers import MarkdownifyTransformer markdownify = MarkdownifyTransformer() docs_transform = markdownify.transform_documents(docs) ``` - Example of better performance on a simple task that I've noticed: ``` <html> <head><title>Reports on product movement</title></head> <body> <p data-block-key="2wst7">The reports on product movement will be useful for forming supplier orders and controlling outcomes.</p> </body> ``` Html2TextTransformer: ```python [Document(page_content='The reports on product movement will be useful for forming supplier orders and\ncontrolling outcomes.\n\n')] # Here we can see 'and\ncontrolling', which has an extra '\n' in it ``` MarkdownifyTransformer: ```python [Document(page_content='Reports on product movement\n\nThe reports on product movement will be useful for forming supplier orders and controlling outcomes.')] ``` --------- Co-authored-by: Sokolov Fedor <f.sokolov@sokolov-macbook.bbrouter> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com> Co-authored-by: Sokolov Fedor <f.sokolov@sokolov-macbook.local> Co-authored-by: Sokolov Fedor <f.sokolov@192.168.1.6> 2024-05-08 21:45:13 +00:00			`import re`
			`from typing import Any, List, Optional, Sequence, Union`

			`from langchain_core.documents import BaseDocumentTransformer, Document`


			`class MarkdownifyTransformer(BaseDocumentTransformer):`
			`"""Converts HTML documents to Markdown format with customizable options for handling`
			`links, images, other tags and heading styles using the markdownify library.`

			`Arguments:`
			`strip: A list of tags to strip. This option can't be used with the convert option.`
			`convert: A list of tags to convert. This option can't be used with the strip option.`
			`autolinks: A boolean indicating whether the "automatic link" style should be used when a a tag's contents match its href. Defaults to True.`
			`heading_style: Defines how headings should be converted. Accepted values are ATX, ATX_CLOSED, SETEXT, and UNDERLINED (which is an alias for SETEXT). Defaults to ATX.`
			`**kwargs: Additional options to pass to markdownify.`

			`Example:`
			`.. code-block:: python`
			`from langchain_community.document_transformers import MarkdownifyTransformer`
			`markdownify = MarkdownifyTransformer()`
			`docs_transform = markdownify.transform_documents(docs)`

			`More configuration options can be found at the markdownify GitHub page:`
			`https://github.com/matthewwithanm/python-markdownify`
			`""" # noqa: E501`

			`def __init__(`
			`self,`
			`strip: Optional[Union[str, List[str]]] = None,`
			`convert: Optional[Union[str, List[str]]] = None,`
			`autolinks: bool = True,`
			`heading_style: str = "ATX",`
			`**kwargs: Any,`
			`) -> None:`
			`self.strip = [strip] if isinstance(strip, str) else strip`
			`self.convert = [convert] if isinstance(convert, str) else convert`
			`self.autolinks = autolinks`
			`self.heading_style = heading_style`
			`self.additional_options = kwargs`

			`def transform_documents(`
			`self,`
			`documents: Sequence[Document],`
			`**kwargs: Any,`
			`) -> Sequence[Document]:`
			`try:`
			`from markdownify import markdownify`
			`except ImportError:`
			`raise ImportError(`
			`"""markdownify package not found, please`
			install it with `pip install markdownify`"""
			`)`

			`converted_documents = []`
			`for doc in documents:`
			`markdown_content = (`
			`markdownify(`
			`html=doc.page_content,`
			`strip=self.strip,`
			`convert=self.convert,`
			`autolinks=self.autolinks,`
			`heading_style=self.heading_style,`
			`**self.additional_options,`
			`)`
			`.replace("\xa0", " ")`
			`.strip()`
			`)`

			`cleaned_markdown = re.sub(r"\n\s*\n", "\n\n", markdown_content)`

			`converted_documents.append(`
			`Document(cleaned_markdown, metadata=doc.metadata)`
			`)`

			`return converted_documents`

			`async def atransform_documents(`
			`self,`
			`documents: Sequence[Document],`
			`**kwargs: Any,`
			`) -> Sequence[Document]:`
			`raise NotImplementedError`