text-splitters[minor]: Added Haskell support in langchain.text_splitter module (#16191)

- **Description:** Haskell language support added in text_splitter module - **Dependencies:** No - **Twitter handle:** @nisargtr If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
6 months ago · 1252ccce6f
parent b7344e3347
commit 1252ccce6f
4 changed files with 120 additions and 8 deletions
--- a/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb
+++ b/docs/docs/modules/data_connection/document_transformers/code_splitter.ipynb
@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
   "id": "a9e37aa1",
   "metadata": {},
   "outputs": [],
@ -35,7 +35,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "id": "e21a2434",
   "metadata": {},
   "outputs": [
@ -61,10 +61,14 @@
       " 'html',\n",
       " 'sol',\n",
       " 'csharp',\n",
-       " 'cobol']"
+       " 'cobol',\n",
       " 'c',\n",
       " 'lua',\n",
       " 'perl',\n",
       " 'haskell']"
      ]
     },
-     "execution_count": 2,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -564,13 +568,50 @@
    "c_docs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af9de667-230e-4c2a-8c5f-122a28515d97",
   "metadata": {},
   "source": [
    "## Haskell\n",
    "Here's an example using the Haskell text splitter:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "id": "688185b5",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
-   "source": []
+    {
     "data": {
      "text/plain": [
       "[Document(page_content='main :: IO ()'),\n",
       " Document(page_content='main = do\\n    putStrLn \"Hello, World!\"\\n-- Some'),\n",
       " Document(page_content='sample functions\\nadd :: Int -> Int -> Int\\nadd x y'),\n",
       " Document(page_content='= x + y')]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "HASKELL_CODE = \"\"\"\n",
    "main :: IO ()\n",
    "main = do\n",
    "    putStrLn \"Hello, World!\"\n",
    "-- Some sample functions\n",
    "add :: Int -> Int -> Int\n",
    "add x y = x + y\n",
    "\"\"\"\n",
    "haskell_splitter = RecursiveCharacterTextSplitter.from_language(\n",
    "    language=Language.HASKELL, chunk_size=50, chunk_overlap=0\n",
    ")\n",
    "haskell_docs = haskell_splitter.create_documents([HASKELL_CODE])\n",
    "haskell_docs"
   ]
  }
 ],
 "metadata": {
--- a/libs/text-splitters/langchain_text_splitters/base.py
+++ b/libs/text-splitters/langchain_text_splitters/base.py
@ -291,6 +291,7 @@ class Language(str, Enum):
    C = "c"
    LUA = "lua"
    PERL = "perl"
    HASKELL = "haskell"
@dataclass(frozen=True)
--- a/libs/text-splitters/langchain_text_splitters/character.py
+++ b/libs/text-splitters/langchain_text_splitters/character.py
@ -571,7 +571,45 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                " ",
                "",
            ]
-
+        elif language == Language.HASKELL:
            return [
                # Split along function definitions
                "\nmain :: ",
                "\nmain = ",
                "\nlet ",
                "\nin ",
                "\ndo ",
                "\nwhere ",
                "\n:: ",
                "\n= ",
                # Split along type declarations
                "\ndata ",
                "\nnewtype ",
                "\ntype ",
                "\n:: ",
                # Split along module declarations
                "\nmodule ",
                # Split along import statements
                "\nimport ",
                "\nqualified ",
                "\nimport qualified ",
                # Split along typeclass declarations
                "\nclass ",
                "\ninstance ",
                # Split along case expressions
                "\ncase ",
                # Split along guards in function definitions
                "\n| ",
                # Split along record field declarations
                "\ndata ",
                "\n= {",
                "\n, ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        else:
            raise ValueError(
                f"Language {language} is not supported! "
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@ -1248,6 +1248,38 @@ def test_solidity_code_splitter() -> None:
    ]
 def test_haskell_code_splitter() -> None:
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.HASKELL, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    code = """
        main :: IO ()
        main = do
          putStrLn "Hello, World!"
        -- Some sample functions
        add :: Int -> Int -> Int
        add x y = x + y
    """
    # Adjusted expected chunks to account for indentation and newlines
    expected_chunks = [
        "main ::",
        "IO ()",
        "main = do",
        "putStrLn",
        '"Hello, World!"',
        "--",
        "Some sample",
        "functions",
        "add :: Int ->",
        "Int -> Int",
        "add x y = x",
        "+ y",
    ]
    chunks = splitter.split_text(code)
    assert chunks == expected_chunks
@pytest.mark.requires("lxml")
 def test_html_header_text_splitter(tmp_path: Path) -> None:
    splitter = HTMLHeaderTextSplitter(