From 9f0b63dba0ab194d14ad9e81245e61f1ec1032c2 Mon Sep 17 00:00:00 2001 From: Giulio Zani Date: Tue, 6 Feb 2024 01:18:57 +0100 Subject: [PATCH] experimental[patch]: Fixes issue #17060 (#17062) As described in issue #17060, the following function fails when the text contains only one sentence, because the `np.percentile` call fails in that case. Checking for a single sentence and returning the one-element sentence list early fixes the issue. ```python def split_text(self, text: str) -> List[str]: """Split text into multiple components.""" # Splitting the essay on '.', '?', and '!' single_sentences_list = re.split(r"(?<=[.?!])\s+", text) sentences = [ {"sentence": x, "index": i} for i, x in enumerate(single_sentences_list) ] sentences = combine_sentences(sentences) embeddings = self.embeddings.embed_documents( [x["combined_sentence"] for x in sentences] ) for i, sentence in enumerate(sentences): sentence["combined_sentence_embedding"] = embeddings[i] distances, sentences = calculate_cosine_distances(sentences) start_index = 0 # Create a list to hold the grouped sentences chunks = [] breakpoint_percentile_threshold = 95 breakpoint_distance_threshold = np.percentile( distances, breakpoint_percentile_threshold ) # If you want more chunks, lower the percentile cutoff indices_above_thresh = [ i for i, x in enumerate(distances) if x > breakpoint_distance_threshold ] # The indices of those breakpoints on your list # Iterate through the breakpoints to slice the sentences for index in indices_above_thresh: # The end index is the current breakpoint end_index = index # Slice the sentence_dicts from the current start index to the end index group = sentences[start_index : end_index + 1] combined_text = " ".join([d["sentence"] for d in group]) chunks.append(combined_text) # Update the start index for the next group start_index = index + 1 # The last group, if any sentences remain if start_index < len(sentences): combined_text = " ".join([d["sentence"] for d in sentences[start_index:]]) chunks.append(combined_text) return chunks ``` Co-authored-by: Giulio Zani --- 
libs/experimental/langchain_experimental/text_splitter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libs/experimental/langchain_experimental/text_splitter.py b/libs/experimental/langchain_experimental/text_splitter.py index 27818bb6e4..c032c33856 100644 --- a/libs/experimental/langchain_experimental/text_splitter.py +++ b/libs/experimental/langchain_experimental/text_splitter.py @@ -85,6 +85,12 @@ class SemanticChunker(BaseDocumentTransformer): """Split text into multiple components.""" # Splitting the essay on '.', '?', and '!' single_sentences_list = re.split(r"(?<=[.?!])\s+", text) + + # having len(single_sentences_list) == 1 would cause the following + # np.percentile to fail. + if len(single_sentences_list) == 1: + return single_sentences_list + sentences = [ {"sentence": x, "index": i} for i, x in enumerate(single_sentences_list) ]