experimental[patch]: Makes regex customizable in text_splitter.py (SemanticChunker class) (#20485)

- **Description:** Currently, the sentence-splitting regex is static
(`r"(?<=[.?!])\s+"`), which is only useful for certain use cases. This
change only moves it to a constructor parameter of `SemanticChunker`
(`sentence_split_regex`), which adds flexibility without making the class
more complex (the default regex is still the same).
- **Issue:** Not applicable (I searched, no one seems to have created
this issue yet).
- **Dependencies:** None.


_If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, hwchase17._

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
GustavoSept 2024-04-24 21:32:40 -03:00 committed by GitHub
parent a936f696a6
commit c2d09a5186
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -112,12 +112,14 @@ class SemanticChunker(BaseDocumentTransformer):
breakpoint_threshold_type: BreakpointThresholdType = "percentile", breakpoint_threshold_type: BreakpointThresholdType = "percentile",
breakpoint_threshold_amount: Optional[float] = None, breakpoint_threshold_amount: Optional[float] = None,
number_of_chunks: Optional[int] = None, number_of_chunks: Optional[int] = None,
sentence_split_regex: str = r"(?<=[.?!])\s+",
): ):
self._add_start_index = add_start_index self._add_start_index = add_start_index
self.embeddings = embeddings self.embeddings = embeddings
self.buffer_size = buffer_size self.buffer_size = buffer_size
self.breakpoint_threshold_type = breakpoint_threshold_type self.breakpoint_threshold_type = breakpoint_threshold_type
self.number_of_chunks = number_of_chunks self.number_of_chunks = number_of_chunks
self.sentence_split_regex = sentence_split_regex
if breakpoint_threshold_amount is None: if breakpoint_threshold_amount is None:
self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[ self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[
breakpoint_threshold_type breakpoint_threshold_type
@ -189,8 +191,8 @@ class SemanticChunker(BaseDocumentTransformer):
self, self,
text: str, text: str,
) -> List[str]: ) -> List[str]:
# Splitting the essay on '.', '?', and '!' # Splitting the essay (by default on '.', '?', and '!')
single_sentences_list = re.split(r"(?<=[.?!])\s+", text) single_sentences_list = re.split(self.sentence_split_regex, text)
# having len(single_sentences_list) == 1 would cause the following # having len(single_sentences_list) == 1 would cause the following
# np.percentile to fail. # np.percentile to fail.