experimental[patch]: Makes regex customizable in text_splitter.py (SemanticChunker class) (#20485)

- **Description:** Currently, the sentence-splitting regex is hard-coded
(`r"(?<=[.?!])\s+"`), which is only useful for certain use cases. This change
exposes it as the `sentence_split_regex` constructor parameter of
`SemanticChunker`, which adds flexibility without adding complexity (the
default regex is unchanged).
- **Issue:** Not applicable (I searched, no one seems to have created
this issue yet).
- **Dependencies:** None.


_If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, hwchase17._

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/20467/head
GustavoSept 1 month ago committed by GitHub
parent a936f696a6
commit c2d09a5186
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -112,12 +112,14 @@ class SemanticChunker(BaseDocumentTransformer):
breakpoint_threshold_type: BreakpointThresholdType = "percentile",
breakpoint_threshold_amount: Optional[float] = None,
number_of_chunks: Optional[int] = None,
sentence_split_regex: str = r"(?<=[.?!])\s+",
):
self._add_start_index = add_start_index
self.embeddings = embeddings
self.buffer_size = buffer_size
self.breakpoint_threshold_type = breakpoint_threshold_type
self.number_of_chunks = number_of_chunks
self.sentence_split_regex = sentence_split_regex
if breakpoint_threshold_amount is None:
self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[
breakpoint_threshold_type
@ -189,8 +191,8 @@ class SemanticChunker(BaseDocumentTransformer):
self,
text: str,
) -> List[str]:
# Splitting the essay on '.', '?', and '!'
single_sentences_list = re.split(r"(?<=[.?!])\s+", text)
# Splitting the essay (by default on '.', '?', and '!')
single_sentences_list = re.split(self.sentence_split_regex, text)
# having len(single_sentences_list) == 1 would cause the following
# np.percentile to fail.

Loading…
Cancel
Save