SemanticChunker: Feature Addition ("Semantic Splitting with gradient") (#22895)

`SemanticChunker` currently provides three methods to split text semantically:
- percentile
- standard_deviation
- interquartile

I propose a new method, `gradient`. In this method, the gradient of the distance array is computed, and the percentile method is then applied to that gradient to find breakpoints. This is useful when chunks are highly correlated with each other or specific to a domain, e.g. legal or medical: applying anomaly detection to the gradient array widens the distribution, making it easier to identify boundaries in highly semantic data.
I have tested this change on a set of 10 domain-specific documents (mostly legal).
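
To make the idea concrete, here is a minimal numpy sketch of the gradient thresholding (toy distance values, illustrative only; not the exact library code):

```python
import numpy as np

# Cosine distances between consecutive sentence embeddings (toy values).
distances = [0.10, 0.12, 0.11, 0.55, 0.13, 0.14, 0.60, 0.12]

# Rate of change of the distances; sharp rises mark likely topic boundaries.
distance_gradient = np.gradient(distances, range(0, len(distances)))

# Apply the usual percentile rule to the gradient instead of the raw distances.
threshold = np.percentile(distance_gradient, 95)
breakpoints = [i for i, g in enumerate(distance_gradient) if g > threshold]
print(breakpoints)  # sentence indices after which a new chunk would start
```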

Details:
    - **Issue:** Improvement
    - **Dependencies:** NA
    - **Twitter handle:** [x.com/prajapat_ravi](https://x.com/prajapat_ravi)


@hwchase17

---------

Co-authored-by: Raviraj Prajapat <raviraj.prajapat@sirionlabs.com>
Co-authored-by: isaac hershenson <ihershenson@hmc.edu>

@@ -297,13 +297,67 @@
"print(len(docs))"
]
},
{
"cell_type": "markdown",
"source": [
"### Gradient\n",
"\n",
"In this method, the gradient of distance is used to split chunks along with the percentile method.\n",
"This method is useful when chunks are highly correlated with each other or specific to a domain e.g. legal or medical. The idea is to apply anomaly detection on gradient array so that the distribution become wider and easy to identify boundaries in highly semantic data."
],
"metadata": {
"collapsed": false
},
"id": "423c6e099e94ca69"
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1f65472",
"metadata": {},
"outputs": [],
"source": []
"source": [
"text_splitter = SemanticChunker(\n",
" OpenAIEmbeddings(), breakpoint_threshold_type=\"gradient\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Madam Speaker, Madam Vice President, our First Lady and Second Gentleman.\n"
]
}
],
"source": [
"docs = text_splitter.create_documents([state_of_the_union])\n",
"print(docs[0].page_content)"
],
"metadata": {},
"id": "e9f393d316ce1f6c"
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"26\n"
]
}
],
"source": [
"print(len(docs))"
],
"metadata": {},
"id": "a407cd57f02a0db4"
}
],
"metadata": {

@@ -84,11 +84,14 @@ def calculate_cosine_distances(sentences: List[dict]) -> Tuple[List[float], List
return distances, sentences
BreakpointThresholdType = Literal["percentile", "standard_deviation", "interquartile"]
BreakpointThresholdType = Literal[
"percentile", "standard_deviation", "interquartile", "gradient"
]
BREAKPOINT_DEFAULTS: Dict[BreakpointThresholdType, float] = {
"percentile": 95,
"standard_deviation": 3,
"interquartile": 1.5,
"gradient": 95,
}
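
Note that `gradient` keeps the percentile semantics: the default amount of 95 means the 95th percentile, here taken over the gradient of the distances rather than the distances themselves, so `breakpoint_threshold_amount` can be tuned the same way as for `percentile`. A usage sketch mirroring the notebook cell (assumes `langchain_openai` is installed):

```python
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

# Lower percentile -> more breakpoints -> smaller chunks.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=90.0,
)
```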
@@ -127,23 +130,34 @@ class SemanticChunker(BaseDocumentTransformer):
else:
self.breakpoint_threshold_amount = breakpoint_threshold_amount
def _calculate_breakpoint_threshold(
self, distances: List[float]
) -> Tuple[float, List[float]]:
if self.breakpoint_threshold_type == "percentile":
return cast(
float,
np.percentile(distances, self.breakpoint_threshold_amount),
), distances
elif self.breakpoint_threshold_type == "standard_deviation":
return cast(
float,
np.mean(distances)
+ self.breakpoint_threshold_amount * np.std(distances),
), distances
elif self.breakpoint_threshold_type == "interquartile":
q1, q3 = np.percentile(distances, [25, 75])
iqr = q3 - q1
return np.mean(
distances
) + self.breakpoint_threshold_amount * iqr, distances
elif self.breakpoint_threshold_type == "gradient":
# Calculate the threshold based on the distribution of the gradient of the distance array. # noqa: E501
distance_gradient = np.gradient(distances, range(0, len(distances)))
return cast(
float,
np.percentile(distance_gradient, self.breakpoint_threshold_amount),
), distance_gradient
else:
raise ValueError(
f"Got unexpected `breakpoint_threshold_type`: "
@@ -201,13 +215,17 @@ class SemanticChunker(BaseDocumentTransformer):
distances, sentences = self._calculate_sentence_distances(single_sentences_list)
if self.number_of_chunks is not None:
breakpoint_distance_threshold = self._threshold_from_clusters(distances)
breakpoint_array = distances
else:
(
breakpoint_distance_threshold,
breakpoint_array,
) = self._calculate_breakpoint_threshold(distances)
indices_above_thresh = [
i
for i, x in enumerate(breakpoint_array)
if x > breakpoint_distance_threshold
]
chunks = []
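
For reviewers who want to exercise the new option end to end without an API key, here is a self-contained sketch. It assumes `DeterministicFakeEmbedding` from `langchain_core` as a stand-in for a real embedding model, and the sample text is illustrative:

```python
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_experimental.text_splitter import SemanticChunker

# Deterministic stand-in embeddings: no API key needed, stable outputs.
embeddings = DeterministicFakeEmbedding(size=256)

text_splitter = SemanticChunker(
    embeddings, breakpoint_threshold_type="gradient"
)

text = (
    "The court granted the motion to dismiss. "
    "The defendant appealed the ruling to a higher court. "
    "Patients in the trial showed improved outcomes. "
    "The dosage was adjusted on a weekly schedule."
)
docs = text_splitter.create_documents([text])
for doc in docs:
    print(doc.page_content)
```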
