mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
langchain : text_splitters Added PowerShell (#24582)
- **Description:** Added PowerShell language support to the text splitters, including the relevant documentation update. - **Issue:** None - **Dependencies:** None --------- Co-authored-by: tzitman <tamir.zitman@intel.com> Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
parent
187ee96f7a
commit
b3e1378f2b
@ -54,7 +54,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9e4144de-d925-4d4c-91c3-685ef8baa57c",
|
||||
"id": "2bb9c73f-9d00-4a19-a81f-cab2f0fd921a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -63,7 +63,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 4,
|
||||
"id": "a9e37aa1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -718,8 +718,44 @@
|
||||
"php_splitter = RecursiveCharacterTextSplitter.from_language(\n",
|
||||
" language=Language.PHP, chunk_size=50, chunk_overlap=0\n",
|
||||
")\n",
|
||||
"haskell_docs = php_splitter.create_documents([PHP_CODE])\n",
|
||||
"haskell_docs"
|
||||
"php_docs = php_splitter.create_documents([PHP_CODE])\n",
|
||||
"php_docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e9fa62c1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## PowerShell\n",
|
||||
"Here's an example using the PowerShell text splitter:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "7e6893ad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"POWERSHELL_CODE = \"\"\"\n",
|
||||
"$directoryPath = Get-Location\n",
|
||||
"\n",
|
||||
"$items = Get-ChildItem -Path $directoryPath\n",
|
||||
"\n",
|
||||
"$files = $items | Where-Object { -not $_.PSIsContainer }\n",
|
||||
"\n",
|
||||
"$sortedFiles = $files | Sort-Object LastWriteTime\n",
|
||||
"\n",
|
||||
"foreach ($file in $sortedFiles) {\n",
|
||||
" Write-Output (\"Name: \" + $file.Name + \" | Last Write Time: \" + $file.LastWriteTime)\n",
|
||||
"}\n",
|
||||
"\"\"\"\n",
|
||||
"powershell_splitter = RecursiveCharacterTextSplitter.from_language(\n",
|
||||
" language=Language.POWERSHELL, chunk_size=100, chunk_overlap=0\n",
|
||||
")\n",
|
||||
"powershell_docs = powershell_splitter.create_documents([POWERSHELL_CODE])\n",
|
||||
"powershell_docs"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -739,7 +775,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.5"
|
||||
"version": "3.10.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -294,6 +294,7 @@ class Language(str, Enum):
|
||||
PERL = "perl"
|
||||
HASKELL = "haskell"
|
||||
ELIXIR = "elixir"
|
||||
POWERSHELL = "powershell"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
@ -659,6 +659,30 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.POWERSHELL:
|
||||
return [
|
||||
# Split along function definitions
|
||||
"\nfunction ",
|
||||
# Split along parameter declarations (lines starting with the `param` keyword)
|
||||
"\nparam ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nforeach ",
|
||||
"\nfor ",
|
||||
"\nwhile ",
|
||||
"\nswitch ",
|
||||
# Split along class definitions (for PowerShell 5.0 and above)
|
||||
"\nclass ",
|
||||
# Split along try-catch-finally blocks
|
||||
"\ntry ",
|
||||
"\ncatch ",
|
||||
"\nfinally ",
|
||||
# Split by normal lines and empty spaces
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language in Language._value2member_map_:
|
||||
raise ValueError(f"Language {language} is not implemented yet!")
|
||||
else:
|
||||
|
@ -1974,3 +1974,56 @@ def test_split_json_many_calls() -> None:
|
||||
|
||||
assert chunk0 == chunk0_output
|
||||
assert chunk1 == chunk1_output
|
||||
|
||||
|
||||
def test_powershell_code_splitter_short_code() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.POWERSHELL, chunk_size=60, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
# Check if a file exists
|
||||
$filePath = "C:\\temp\\file.txt"
|
||||
if (Test-Path $filePath) {
|
||||
# File exists
|
||||
} else {
|
||||
# File does not exist
|
||||
}
|
||||
"""
|
||||
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
'# Check if a file exists\n$filePath = "C:\\temp\\file.txt"',
|
||||
"if (Test-Path $filePath) {\n # File exists\n} else {",
|
||||
"# File does not exist\n}",
|
||||
]
|
||||
|
||||
|
||||
def test_powershell_code_splitter_longer_code() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.POWERSHELL, chunk_size=60, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
# Get a list of all processes and export to CSV
|
||||
$processes = Get-Process
|
||||
$processes | Export-Csv -Path "C:\\temp\\processes.csv" -NoTypeInformation
|
||||
|
||||
# Read the CSV file and display its content
|
||||
$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"
|
||||
$csvContent | ForEach-Object {
|
||||
$_.ProcessName
|
||||
}
|
||||
|
||||
# End of script
|
||||
"""
|
||||
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"# Get a list of all processes and export to CSV",
|
||||
"$processes = Get-Process",
|
||||
'$processes | Export-Csv -Path "C:\\temp\\processes.csv"',
|
||||
"-NoTypeInformation",
|
||||
"# Read the CSV file and display its content",
|
||||
'$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"',
|
||||
"$csvContent | ForEach-Object {\n $_.ProcessName\n}",
|
||||
"# End of script",
|
||||
]
|
||||
|
Loading…
Reference in New Issue
Block a user