mirror of
https://github.com/hwchase17/langchain
synced 2024-11-02 09:40:22 +00:00
experimental[patch]: Makes regex customizable in text_splitter.py (SemanticChunker class) (#20485)
- **Description:** Currently, the sentence-splitting regex is hard-coded (`r"(?<=[.?!])\s+"`), which is only useful for certain use cases. This change turns it into a `sentence_split_regex` constructor parameter of `SemanticChunker`; the value is stored on the instance and used in place of the hard-coded pattern when the text is split into sentences. This adds flexibility without adding complexity, because the default regex is unchanged. - **Issue:** Not applicable (I searched; no one seems to have created an issue for this yet). - **Dependencies:** None. _If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17._ --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
a936f696a6
commit
c2d09a5186
@ -112,12 +112,14 @@ class SemanticChunker(BaseDocumentTransformer):
|
|||||||
breakpoint_threshold_type: BreakpointThresholdType = "percentile",
|
breakpoint_threshold_type: BreakpointThresholdType = "percentile",
|
||||||
breakpoint_threshold_amount: Optional[float] = None,
|
breakpoint_threshold_amount: Optional[float] = None,
|
||||||
number_of_chunks: Optional[int] = None,
|
number_of_chunks: Optional[int] = None,
|
||||||
|
sentence_split_regex: str = r"(?<=[.?!])\s+",
|
||||||
):
|
):
|
||||||
self._add_start_index = add_start_index
|
self._add_start_index = add_start_index
|
||||||
self.embeddings = embeddings
|
self.embeddings = embeddings
|
||||||
self.buffer_size = buffer_size
|
self.buffer_size = buffer_size
|
||||||
self.breakpoint_threshold_type = breakpoint_threshold_type
|
self.breakpoint_threshold_type = breakpoint_threshold_type
|
||||||
self.number_of_chunks = number_of_chunks
|
self.number_of_chunks = number_of_chunks
|
||||||
|
self.sentence_split_regex = sentence_split_regex
|
||||||
if breakpoint_threshold_amount is None:
|
if breakpoint_threshold_amount is None:
|
||||||
self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[
|
self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[
|
||||||
breakpoint_threshold_type
|
breakpoint_threshold_type
|
||||||
@ -189,8 +191,8 @@ class SemanticChunker(BaseDocumentTransformer):
|
|||||||
self,
|
self,
|
||||||
text: str,
|
text: str,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
# Splitting the essay on '.', '?', and '!'
|
# Splitting the essay (by default on '.', '?', and '!')
|
||||||
single_sentences_list = re.split(r"(?<=[.?!])\s+", text)
|
single_sentences_list = re.split(self.sentence_split_regex, text)
|
||||||
|
|
||||||
# having len(single_sentences_list) == 1 would cause the following
|
# having len(single_sentences_list) == 1 would cause the following
|
||||||
# np.percentile to fail.
|
# np.percentile to fail.
|
||||||
|
Loading…
Reference in New Issue
Block a user