From b8baead70ca2f93220a73e39918782ee5d23109c Mon Sep 17 00:00:00 2001 From: Davide Menini <48685774+dmenini@users.noreply.github.com> Date: Mon, 4 Sep 2023 00:10:25 +0200 Subject: [PATCH] fix (Html2TextTransformer): allow configuration of html2text (#9914) Hi, this PR enables configuring the html2text package, instead of being bound to use the hardcoded values. While simply passing `ignore_links` and `ignore_images` to the `transform_documents` method was possible, I preferred passing them to the `__init__` method for 2 reasons: 1. It is more efficient in case of subsequent calls to `transform_documents`. 2. It allows moving the "complexity" to instantiation, keeping the actual execution simple and general enough. IMO the transformers should all follow this pattern, allowing something like this: ```python # Instantiate transformers transformers = [ TransformerA(foo='bar'), TransformerB(bar='foo'), # others ] # During execution, call them sequentially documents = ... for tr in transformers: documents = tr.transform_documents(documents) ``` Thanks for the reviews! --------- Co-authored-by: taamedag --- .../document_transformers/html2text.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/libs/langchain/langchain/document_transformers/html2text.py b/libs/langchain/langchain/document_transformers/html2text.py index ede87af9c7..3c123950d3 100644 --- a/libs/langchain/langchain/document_transformers/html2text.py +++ b/libs/langchain/langchain/document_transformers/html2text.py @@ -5,13 +5,22 @@ from langchain.schema import BaseDocumentTransformer, Document class Html2TextTransformer(BaseDocumentTransformer): """Replace occurrences of a particular search pattern with a replacement string + + Arguments: + ignore_links: Whether links should be ignored; defaults to True. + ignore_images: Whether images should be ignored; defaults to True. + Example: .. 
code-block:: python from langchain.document_transformers import Html2TextTransformer - html2text=Html2TextTransformer() - docs_transform=html2text.transform_documents(docs) + html2text = Html2TextTransformer() + docs_transform = html2text.transform_documents(docs) """ + def __init__(self, ignore_links: bool = True, ignore_images: bool = True) -> None: + self.ignore_links = ignore_links + self.ignore_images = ignore_images + def transform_documents( self, documents: Sequence[Document], @@ -25,10 +34,11 @@ class Html2TextTransformer(BaseDocumentTransformer): install it with `pip install html2text`""" ) - # Create an html2text.HTML2Text object and override some properties + # Create a html2text.HTML2Text object and override some properties h = html2text.HTML2Text() - h.ignore_links = True - h.ignore_images = True + h.ignore_links = self.ignore_links + h.ignore_images = self.ignore_images + for d in documents: d.page_content = h.handle(d.page_content) return documents