experimental[patch]: Makes regex customizable in text_splitter.py (SemanticChunker class) (#20485)

- **Description:** Currently, the regex is static (`r"(?<=[.?!])\s+"`), which is only useful for certain use cases. The current change only moves this to be a parameter of split_text(). Which adds flexibility without making it more complex (as the default regex is still the same). - **Issue:** Not applicable (I searched, no one seems to have created this issue yet). - **Dependencies:** None. _If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17._ --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
2024-10-31 15:20:26 +00:00 · 2024-04-24 21:32:40 -03:00 · 2024-04-24 21:32:40 -03:00 · c2d09a5186
commit c2d09a5186
parent a936f696a6
1 changed files with 4 additions and 2 deletions
--- a/libs/experimental/langchain_experimental/text_splitter.py
+++ b/libs/experimental/langchain_experimental/text_splitter.py
@ -112,12 +112,14 @@ class SemanticChunker(BaseDocumentTransformer):
        breakpoint_threshold_type: BreakpointThresholdType = "percentile",
        breakpoint_threshold_amount: Optional[float] = None,
        number_of_chunks: Optional[int] = None,
+        sentence_split_regex: str = r"(?<=[.?!])\s+",
    ):
        self._add_start_index = add_start_index
        self.embeddings = embeddings
        self.buffer_size = buffer_size
        self.breakpoint_threshold_type = breakpoint_threshold_type
        self.number_of_chunks = number_of_chunks
+        self.sentence_split_regex = sentence_split_regex
        if breakpoint_threshold_amount is None:
            self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[
                breakpoint_threshold_type
@ -189,8 +191,8 @@ class SemanticChunker(BaseDocumentTransformer):
        self,
        text: str,
    ) -> List[str]:
-        # Splitting the essay on '.', '?', and '!'
-        single_sentences_list = re.split(r"(?<=[.?!])\s+", text)
+        # Splitting the essay (by default on '.', '?', and '!')
+        single_sentences_list = re.split(self.sentence_split_regex, text)

        # having len(single_sentences_list) == 1 would cause the following
        # np.percentile to fail.