mirror of
https://github.com/hwchase17/langchain
synced 2024-10-31 15:20:26 +00:00
experimental[patch]: Makes regex customizable in text_splitter.py (SemanticChunker class) (#20485)
- **Description:** Currently, the sentence-splitting regex is hard-coded (`r"(?<=[.?!])\s+"`), which is only useful for certain use cases. This change turns it into a configurable parameter (`sentence_split_regex`, passed to the `SemanticChunker` constructor and used when splitting text in split_text()), which adds flexibility without adding complexity, since the default regex is unchanged. - **Issue:** Not applicable (I searched, and no one seems to have created an issue for this yet). - **Dependencies:** None. _If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17._ --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
a936f696a6
commit
c2d09a5186
@ -112,12 +112,14 @@ class SemanticChunker(BaseDocumentTransformer):
|
||||
breakpoint_threshold_type: BreakpointThresholdType = "percentile",
|
||||
breakpoint_threshold_amount: Optional[float] = None,
|
||||
number_of_chunks: Optional[int] = None,
|
||||
sentence_split_regex: str = r"(?<=[.?!])\s+",
|
||||
):
|
||||
self._add_start_index = add_start_index
|
||||
self.embeddings = embeddings
|
||||
self.buffer_size = buffer_size
|
||||
self.breakpoint_threshold_type = breakpoint_threshold_type
|
||||
self.number_of_chunks = number_of_chunks
|
||||
self.sentence_split_regex = sentence_split_regex
|
||||
if breakpoint_threshold_amount is None:
|
||||
self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[
|
||||
breakpoint_threshold_type
|
||||
@ -189,8 +191,8 @@ class SemanticChunker(BaseDocumentTransformer):
|
||||
self,
|
||||
text: str,
|
||||
) -> List[str]:
|
||||
# Splitting the essay on '.', '?', and '!'
|
||||
single_sentences_list = re.split(r"(?<=[.?!])\s+", text)
|
||||
# Splitting the essay (by default on '.', '?', and '!')
|
||||
single_sentences_list = re.split(self.sentence_split_regex, text)
|
||||
|
||||
# having len(single_sentences_list) == 1 would cause the following
|
||||
# np.percentile to fail.
|
||||
|
Loading…
Reference in New Issue
Block a user