Add `strip_whitespace` flag to text splitters (#10295)

Refs #10085
---------

Co-authored-by: codesee-maps[bot] <86324825+codesee-maps[bot]@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Ilya 2023-09-07 00:06:12 +03:00 committed by GitHub
parent 1690013711
commit 6e6f15df24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -100,6 +100,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
length_function: Callable[[str], int] = len, length_function: Callable[[str], int] = len,
keep_separator: bool = False, keep_separator: bool = False,
add_start_index: bool = False, add_start_index: bool = False,
strip_whitespace: bool = True,
) -> None: ) -> None:
"""Create a new TextSplitter. """Create a new TextSplitter.
@ -109,6 +110,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):
length_function: Function that measures the length of given chunks length_function: Function that measures the length of given chunks
keep_separator: Whether to keep the separator in the chunks keep_separator: Whether to keep the separator in the chunks
add_start_index: If `True`, includes chunk's start index in metadata add_start_index: If `True`, includes chunk's start index in metadata
strip_whitespace: If `True`, strips whitespace from the start and end of
every document
""" """
if chunk_overlap > chunk_size: if chunk_overlap > chunk_size:
raise ValueError( raise ValueError(
@ -120,6 +123,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
self._length_function = length_function self._length_function = length_function
self._keep_separator = keep_separator self._keep_separator = keep_separator
self._add_start_index = add_start_index self._add_start_index = add_start_index
self._strip_whitespace = strip_whitespace
@abstractmethod @abstractmethod
def split_text(self, text: str) -> List[str]: def split_text(self, text: str) -> List[str]:
@ -152,7 +156,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]: def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
text = separator.join(docs) text = separator.join(docs)
text = text.strip() if self._strip_whitespace:
text = text.strip()
if text == "": if text == "":
return None return None
else: else: