langchain : text_splitters Added PowerShell (#24582)

- **Description:** Added PowerShell support to the text splitters' `Language` enum,
including the relevant docs update.
  - **Issue:** None
  - **Dependencies:** None

---------

Co-authored-by: tzitman <tamir.zitman@intel.com>
Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
Tamir Zitman 2024-07-30 19:13:52 +03:00 committed by GitHub
parent 187ee96f7a
commit b3e1378f2b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 119 additions and 5 deletions

View File

@ -54,7 +54,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "9e4144de-d925-4d4c-91c3-685ef8baa57c",
"id": "2bb9c73f-9d00-4a19-a81f-cab2f0fd921a",
"metadata": {},
"outputs": [],
"source": [
@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"id": "a9e37aa1",
"metadata": {},
"outputs": [],
@ -718,8 +718,44 @@
"php_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" language=Language.PHP, chunk_size=50, chunk_overlap=0\n",
")\n",
"haskell_docs = php_splitter.create_documents([PHP_CODE])\n",
"haskell_docs"
"php_docs = php_splitter.create_documents([PHP_CODE])\n",
"php_docs"
]
},
{
"cell_type": "markdown",
"id": "e9fa62c1",
"metadata": {},
"source": [
"## PowerShell\n",
"Here's an example using the PowerShell text splitter:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e6893ad",
"metadata": {},
"outputs": [],
"source": [
"POWERSHELL_CODE = \"\"\"\n",
"$directoryPath = Get-Location\n",
"\n",
"$items = Get-ChildItem -Path $directoryPath\n",
"\n",
"$files = $items | Where-Object { -not $_.PSIsContainer }\n",
"\n",
"$sortedFiles = $files | Sort-Object LastWriteTime\n",
"\n",
"foreach ($file in $sortedFiles) {\n",
" Write-Output (\"Name: \" + $file.Name + \" | Last Write Time: \" + $file.LastWriteTime)\n",
"}\n",
"\"\"\"\n",
"powershell_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" language=Language.POWERSHELL, chunk_size=100, chunk_overlap=0\n",
")\n",
"powershell_docs = powershell_splitter.create_documents([POWERSHELL_CODE])\n",
"powershell_docs"
]
}
],
@ -739,7 +775,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
"version": "3.10.4"
}
},
"nbformat": 4,

View File

@ -294,6 +294,7 @@ class Language(str, Enum):
PERL = "perl"
HASKELL = "haskell"
ELIXIR = "elixir"
POWERSHELL = "powershell"
@dataclass(frozen=True)

View File

@ -659,6 +659,30 @@ class RecursiveCharacterTextSplitter(TextSplitter):
" ",
"",
]
elif language == Language.POWERSHELL:
return [
# Split along function definitions
"\nfunction ",
# Split along parameter declarations
"\nparam ",
# Split along control flow statements
"\nif ",
"\nforeach ",
"\nfor ",
"\nwhile ",
"\nswitch ",
# Split along class definitions (for PowerShell 5.0 and above)
"\nclass ",
# Split along try-catch-finally blocks
"\ntry ",
"\ncatch ",
"\nfinally ",
# Split by normal lines and empty spaces
"\n\n",
"\n",
" ",
"",
]
elif language in Language._value2member_map_:
raise ValueError(f"Language {language} is not implemented yet!")
else:

View File

@ -1974,3 +1974,56 @@ def test_split_json_many_calls() -> None:
assert chunk0 == chunk0_output
assert chunk1 == chunk1_output
def test_powershell_code_splitter_short_code() -> None:
    """Split a short PowerShell snippet containing an ``if``/``else`` block.

    The splitter is built with ``chunk_size=60`` and no overlap, and the
    snippet is expected to come back as three chunks. The second expected
    chunk keeps the indentation of the ``if`` body, so the sample script
    below must carry that same indentation.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.POWERSHELL, chunk_size=60, chunk_overlap=0
    )
    # The script text starts at column 0; only the block bodies are indented.
    code = """
# Check if a file exists
$filePath = "C:\\temp\\file.txt"
if (Test-Path $filePath) {
    # File exists
} else {
    # File does not exist
}
"""

    chunks = splitter.split_text(code)
    assert chunks == [
        '# Check if a file exists\n$filePath = "C:\\temp\\file.txt"',
        "if (Test-Path $filePath) {\n    # File exists\n} else {",
        "# File does not exist\n}",
    ]
def test_powershell_code_splitter_longer_code() -> None:
    """Split a longer PowerShell script with a pipeline and a script block.

    The splitter is built with ``chunk_size=60`` and no overlap. The
    expected result has eight chunks; the ``ForEach-Object`` script block is
    expected to stay together in one chunk with its body indentation
    intact, so the sample script below must carry that indentation.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.POWERSHELL, chunk_size=60, chunk_overlap=0
    )
    # The script text starts at column 0; only the block body is indented.
    code = """
# Get a list of all processes and export to CSV
$processes = Get-Process
$processes | Export-Csv -Path "C:\\temp\\processes.csv" -NoTypeInformation
# Read the CSV file and display its content
$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"
$csvContent | ForEach-Object {
    $_.ProcessName
}
# End of script
"""

    chunks = splitter.split_text(code)
    assert chunks == [
        "# Get a list of all processes and export to CSV",
        "$processes = Get-Process",
        '$processes | Export-Csv -Path "C:\\temp\\processes.csv"',
        "-NoTypeInformation",
        "# Read the CSV file and display its content",
        '$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"',
        "$csvContent | ForEach-Object {\n    $_.ProcessName\n}",
        "# End of script",
    ]