mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
Add strip text splits flag (#10295)
#10085

---------

Co-authored-by: codesee-maps[bot] <86324825+codesee-maps[bot]@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
1690013711
commit
6e6f15df24
@ -100,6 +100,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
|||||||
length_function: Callable[[str], int] = len,
|
length_function: Callable[[str], int] = len,
|
||||||
keep_separator: bool = False,
|
keep_separator: bool = False,
|
||||||
add_start_index: bool = False,
|
add_start_index: bool = False,
|
||||||
|
strip_whitespace: bool = True,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create a new TextSplitter.
|
"""Create a new TextSplitter.
|
||||||
|
|
||||||
@ -109,6 +110,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
|||||||
length_function: Function that measures the length of given chunks
|
length_function: Function that measures the length of given chunks
|
||||||
keep_separator: Whether to keep the separator in the chunks
|
keep_separator: Whether to keep the separator in the chunks
|
||||||
add_start_index: If `True`, includes chunk's start index in metadata
|
add_start_index: If `True`, includes chunk's start index in metadata
|
||||||
|
strip_whitespace: If `True`, strips whitespace from the start and end of
|
||||||
|
every document
|
||||||
"""
|
"""
|
||||||
if chunk_overlap > chunk_size:
|
if chunk_overlap > chunk_size:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -120,6 +123,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
|||||||
self._length_function = length_function
|
self._length_function = length_function
|
||||||
self._keep_separator = keep_separator
|
self._keep_separator = keep_separator
|
||||||
self._add_start_index = add_start_index
|
self._add_start_index = add_start_index
|
||||||
|
self._strip_whitespace = strip_whitespace
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def split_text(self, text: str) -> List[str]:
|
def split_text(self, text: str) -> List[str]:
|
||||||
@ -152,7 +156,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):
|
|||||||
|
|
||||||
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
|
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
|
||||||
text = separator.join(docs)
|
text = separator.join(docs)
|
||||||
text = text.strip()
|
if self._strip_whitespace:
|
||||||
|
text = text.strip()
|
||||||
if text == "":
|
if text == "":
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user