mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
058a64c563
Hi 👋 First off, thanks a ton for your work on this 💚 Really appreciate what you're providing here for the community. ## Description This PR adds a basic language parser for the [Elixir](https://elixir-lang.org/) programming language. The parser code is based upon the approach outlined in https://github.com/langchain-ai/langchain/pull/13318: it uses `tree-sitter` under the hood and aligns with all the other `tree-sitter`-based parsers added in that PR. The `CHUNK_QUERY` I'm using here is probably not the most sophisticated one, but it worked for my application. It's a starting point to provide "core" parsing support for Elixir in LangChain. It enables people to use the language parser in real-world applications, which may then lead to further tweaking of the queries. I consider this PR just the groundwork. - **Dependencies:** requires `tree-sitter` and `tree-sitter-languages` from the extended dependencies - **Twitter handle:** `@bitcrowd` ## Checklist - [x] **PR title**: "package: description" - [x] **Add tests and docs** - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. <!-- If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. -->
669 lines
21 KiB
Python
669 lines
21 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any, List, Literal, Optional, Union
|
|
|
|
from langchain_text_splitters.base import Language, TextSplitter
|
|
|
|
|
|
class CharacterTextSplitter(TextSplitter):
    """Text splitter that cuts the input on one configurable separator."""

    def __init__(
        self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
    ) -> None:
        """Set up the splitter.

        Args:
            separator: Text (or regex pattern) at which the input is cut.
            is_separator_regex: If true, ``separator`` is used as a regular
                expression verbatim; otherwise it is escaped and matched
                literally.
            **kwargs: Passed through unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(**kwargs)
        self._is_separator_regex = is_separator_regex
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Cut ``text`` on the configured separator and merge the pieces.

        Args:
            text: The input string to split.

        Returns:
            Whatever ``self._merge_splits`` produces for the raw pieces.
        """
        # Build the regex pattern: literal separators must be escaped.
        if self._is_separator_regex:
            pattern = self._separator
        else:
            pattern = re.escape(self._separator)

        # NOTE(review): ``_keep_separator`` and ``_merge_splits`` are not
        # defined in this class; presumably inherited from TextSplitter.
        pieces = _split_text_with_regex(text, pattern, self._keep_separator)

        # When separators stay attached to the pieces, nothing has to be
        # re-inserted between them while merging.
        if self._keep_separator:
            joiner = ""
        else:
            joiner = self._separator
        return self._merge_splits(pieces, joiner)
|
|
|
|
|
|
def _split_text_with_regex(
|
|
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
|
|
) -> List[str]:
|
|
# Now that we have the separator, split the text
|
|
if separator:
|
|
if keep_separator:
|
|
# The parentheses in the pattern keep the delimiters in the result.
|
|
_splits = re.split(f"({separator})", text)
|
|
splits = (
|
|
([_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)])
|
|
if keep_separator == "end"
|
|
else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)])
|
|
)
|
|
if len(_splits) % 2 == 0:
|
|
splits += _splits[-1:]
|
|
splits = (
|
|
(splits + [_splits[-1]])
|
|
if keep_separator == "end"
|
|
else ([_splits[0]] + splits)
|
|
)
|
|
else:
|
|
splits = re.split(separator, text)
|
|
else:
|
|
splits = list(text)
|
|
return [s for s in splits if s != ""]
|
|
|
|
|
|
class RecursiveCharacterTextSplitter(TextSplitter):
    """Split text by recursively trying a list of separators.

    The separators are tried in order; the first one that occurs in the text
    is used, and pieces that are still too long are split again with the
    separators that follow it.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: Union[bool, Literal["start", "end"]] = True,
        is_separator_regex: bool = False,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            separators: Ordered list of separators to try. Defaults to
                ``["\\n\\n", "\\n", " ", ""]`` when ``None`` or empty.
            keep_separator: Forwarded to ``TextSplitter.__init__``; later
                passed to ``_split_text_with_regex`` to decide whether and
                where matched separators stay attached to the pieces.
            is_separator_regex: If true, separators are used as regular
                expressions verbatim; otherwise they are escaped first.
            **kwargs: Passed through unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split ``text`` with the first applicable separator, recursing on
        pieces that are still too long.

        Args:
            text: The string to split.
            separators: Candidate separators, in priority order.

        Returns:
            The list of chunks.
        """
        final_chunks: List[str] = []
        # Pick the separator: default to the last one if none occurs in text.
        separator = separators[-1]
        new_separators: List[str] = []
        for i, _s in enumerate(separators):
            _separator = _s if self._is_separator_regex else re.escape(_s)
            if _s == "":
                # The empty separator always applies (split per character)
                # and leaves nothing finer to recurse with.
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                # Only the separators after the chosen one are used for
                # recursive splitting of oversized pieces.
                new_separators = separators[i + 1 :]
                break

        _separator = separator if self._is_separator_regex else re.escape(separator)
        # NOTE(review): ``_keep_separator``, ``_length_function``,
        # ``_chunk_size`` and ``_merge_splits`` are not defined in this class;
        # presumably they are provided by TextSplitter.
        splits = _split_text_with_regex(text, _separator, self._keep_separator)

        # Now go merging things, recursively splitting longer texts.
        _good_splits: List[str] = []
        # If separators were kept on the pieces, do not insert them again.
        _separator = "" if self._keep_separator else separator
        for s in splits:
            if self._length_function(s) < self._chunk_size:
                _good_splits.append(s)
            else:
                # Flush the short pieces collected so far before dealing with
                # the oversized one, so chunk order follows the input.
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    # Nothing finer to split with: emit the piece as-is.
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        return final_chunks

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` using the separators configured on this instance."""
        return self._split_text(text, self._separators)

    @classmethod
    def from_language(
        cls, language: Language, **kwargs: Any
    ) -> RecursiveCharacterTextSplitter:
        """Build a splitter preconfigured with the separators for ``language``.

        The separators are treated as regular expressions.

        Args:
            language: The language whose separator list should be used.
            **kwargs: Passed through to the class constructor.

        Returns:
            A new splitter instance.
        """
        separators = cls.get_separators_for_language(language)
        return cls(separators=separators, is_separator_regex=True, **kwargs)

    @staticmethod
    def get_separators_for_language(language: Language) -> List[str]:
        """Return the ordered separator list for ``language``.

        The entries are regular-expression patterns (``from_language`` passes
        them on with ``is_separator_regex=True``), so characters that are
        special in regexes must be escaped where a literal match is intended.

        Args:
            language: The language to look up.

        Returns:
            Separators ordered from coarsest to finest.

        Raises:
            ValueError: If ``language`` has no separator list here, or is not
                a ``Language`` member at all.
        """
        if language == Language.CPP:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nvoid ",
                "\nint ",
                "\nfloat ",
                "\ndouble ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.GO:
            return [
                # Split along function definitions
                "\nfunc ",
                "\nvar ",
                "\nconst ",
                "\ntype ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JAVA:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.KOTLIN:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\ninternal ",
                "\ncompanion ",
                "\nfun ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nwhen ",
                "\ncase ",
                "\nelse ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JS:
            return [
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.TS:
            return [
                # Split along type-level declarations
                "\nenum ",
                "\ninterface ",
                "\nnamespace ",
                "\ntype ",
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PHP:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along class definitions
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PROTO:
            return [
                # Split along message definitions
                "\nmessage ",
                # Split along service definitions
                "\nservice ",
                # Split along enum definitions
                "\nenum ",
                # Split along option definitions
                "\noption ",
                # Split along import statements
                "\nimport ",
                # Split along syntax declarations
                "\nsyntax ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PYTHON:
            return [
                # First, try to split along class and function definitions
                "\nclass ",
                "\ndef ",
                "\n\tdef ",
                # Now split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RST:
            return [
                # Split along section titles
                "\n=+\n",
                "\n-+\n",
                "\n\\*+\n",
                # Split along directive markers
                "\n\n.. *\n\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUBY:
            return [
                # Split along method definitions
                "\ndef ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\nfor ",
                "\ndo ",
                "\nbegin ",
                "\nrescue ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.ELIXIR:
            return [
                # Split along function, macro, protocol and module definitions
                "\ndef ",
                "\ndefp ",
                "\ndefmodule ",
                "\ndefprotocol ",
                "\ndefmacro ",
                "\ndefmacrop ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\ncase ",
                "\ncond ",
                "\nwith ",
                "\nfor ",
                "\ndo ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUST:
            return [
                # Split along function definitions
                "\nfn ",
                "\nconst ",
                "\nlet ",
                # Split along control flow statements
                "\nif ",
                "\nwhile ",
                "\nfor ",
                "\nloop ",
                "\nmatch ",
                "\nconst ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SCALA:
            return [
                # Split along class definitions
                "\nclass ",
                "\nobject ",
                # Split along method definitions
                "\ndef ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nmatch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SWIFT:
            return [
                # Split along function definitions
                "\nfunc ",
                # Split along class definitions
                "\nclass ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.MARKDOWN:
            return [
                # First, try to split along ATX headings (one to six '#')
                "\n#{1,6} ",
                # Note the alternative (underlined) heading syntax below is
                # not handled here:
                # Heading level 2
                # ---------------
                # End of code block
                "```\n",
                # Horizontal rules: a line of three or more '*', '-' or '_'.
                # Rules written with spaces between the characters are not
                # matched by these patterns.
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.LATEX:
            return [
                # First, try to split along Latex sections
                "\n\\\\chapter{",
                "\n\\\\section{",
                "\n\\\\subsection{",
                "\n\\\\subsubsection{",
                # Now split by environments
                "\n\\\\begin{enumerate}",
                "\n\\\\begin{itemize}",
                "\n\\\\begin{description}",
                "\n\\\\begin{list}",
                "\n\\\\begin{quote}",
                "\n\\\\begin{quotation}",
                "\n\\\\begin{verse}",
                "\n\\\\begin{verbatim}",
                # Now split by math environments.
                # Four backslashes are required here like everywhere above:
                # with three, Python reads the trailing "\b" as a backspace
                # character and the pattern never matches "\begin{align}".
                "\n\\\\begin{align}",
                # '$' is a regex anchor, so it must be escaped to match the
                # literal math delimiters.
                "\\$\\$",
                "\\$",
                # Now split by the normal type of lines
                " ",
                "",
            ]
        elif language == Language.HTML:
            return [
                # First, try to split along HTML tags
                "<body",
                "<div",
                "<p",
                "<br",
                "<li",
                "<h1",
                "<h2",
                "<h3",
                "<h4",
                "<h5",
                "<h6",
                "<span",
                "<table",
                "<tr",
                "<td",
                "<th",
                "<ul",
                "<ol",
                "<header",
                "<footer",
                "<nav",
                # Head
                "<head",
                "<style",
                "<script",
                "<meta",
                "<title",
                "",
            ]
        elif language == Language.CSHARP:
            return [
                "\ninterface ",
                "\nenum ",
                "\nimplements ",
                "\ndelegate ",
                "\nevent ",
                # Split along class definitions
                "\nclass ",
                "\nabstract ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                "\nreturn ",
                # Split along control flow statements
                "\nif ",
                "\ncontinue ",
                "\nfor ",
                "\nforeach ",
                "\nwhile ",
                "\nswitch ",
                "\nbreak ",
                "\ncase ",
                "\nelse ",
                # Split by exceptions
                "\ntry ",
                "\nthrow ",
                "\nfinally ",
                "\ncatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SOL:
            return [
                # Split along compiler information definitions
                "\npragma ",
                "\nusing ",
                # Split along contract definitions
                "\ncontract ",
                "\ninterface ",
                "\nlibrary ",
                # Split along method definitions
                "\nconstructor ",
                "\ntype ",
                "\nfunction ",
                "\nevent ",
                "\nmodifier ",
                "\nerror ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo while ",
                "\nassembly ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.COBOL:
            return [
                # Split along divisions
                "\nIDENTIFICATION DIVISION.",
                "\nENVIRONMENT DIVISION.",
                "\nDATA DIVISION.",
                "\nPROCEDURE DIVISION.",
                # Split along sections within DATA DIVISION
                "\nWORKING-STORAGE SECTION.",
                "\nLINKAGE SECTION.",
                "\nFILE SECTION.",
                # Split along sections within PROCEDURE DIVISION
                "\nINPUT-OUTPUT SECTION.",
                # Split along paragraphs and common statements
                "\nOPEN ",
                "\nCLOSE ",
                "\nREAD ",
                "\nWRITE ",
                "\nIF ",
                "\nELSE ",
                "\nMOVE ",
                "\nPERFORM ",
                "\nUNTIL ",
                "\nVARYING ",
                "\nACCEPT ",
                "\nDISPLAY ",
                "\nSTOP RUN.",
                # Split by the normal type of lines
                "\n",
                " ",
                "",
            ]
        elif language == Language.LUA:
            return [
                # Split along variable and table definitions
                "\nlocal ",
                # Split along function definitions
                "\nfunction ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.HASKELL:
            return [
                # Split along function definitions
                "\nmain :: ",
                "\nmain = ",
                "\nlet ",
                "\nin ",
                "\ndo ",
                "\nwhere ",
                "\n:: ",
                "\n= ",
                # Split along type declarations
                "\ndata ",
                "\nnewtype ",
                "\ntype ",
                "\n:: ",
                # Split along module declarations
                "\nmodule ",
                # Split along import statements
                "\nimport ",
                "\nqualified ",
                "\nimport qualified ",
                # Split along typeclass declarations
                "\nclass ",
                "\ninstance ",
                # Split along case expressions
                "\ncase ",
                # Split along guards in function definitions.
                # '|' is regex alternation, so it must be escaped; unescaped,
                # the pattern matched any newline or any single space.
                "\n\\| ",
                # Split along record field declarations
                "\ndata ",
                "\n= {",
                "\n, ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language in Language._value2member_map_:
            # A valid Language value that has no separator list above.
            raise ValueError(f"Language {language} is not implemented yet!")
        else:
            raise ValueError(
                f"Language {language} is not supported! "
                f"Please choose from {list(Language)}"
            )
|