langchain/docs/extras/integrations/document_loaders/source_code.ipynb
2023-07-23 23:23:16 -07:00

421 lines
9.6 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "213a38a2",
"metadata": {},
"source": [
"# Source Code\n",
"\n",
"This notebook covers how to load source code files using a special approach with language parsing: each top-level function and class in the code is loaded into separate documents. Any remaining code top-level code outside the already loaded functions and classes will be loaded into a seperate document.\n",
"\n",
"This approach can potentially improve the accuracy of QA models over source code. Currently, the supported languages for code parsing are Python and JavaScript. The language used for parsing can be configured, along with the minimum number of lines required to activate the splitting based on syntax."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fa47b2e",
"metadata": {},
"outputs": [],
"source": [
"! pip install esprima"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "beb55c2f",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"from pprint import pprint\n",
"from langchain.text_splitter import Language\n",
"from langchain.document_loaders.generic import GenericLoader\n",
"from langchain.document_loaders.parsers import LanguageParser"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "64056e07",
"metadata": {},
"outputs": [],
"source": [
"loader = GenericLoader.from_filesystem(\n",
" \"./example_data/source_code\",\n",
" glob=\"*\",\n",
" suffixes=[\".py\", \".js\"],\n",
" parser=LanguageParser(),\n",
")\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8af79bd7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "85edf3fc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'content_type': 'functions_classes',\n",
" 'language': <Language.PYTHON: 'python'>,\n",
" 'source': 'example_data/source_code/example.py'}\n",
"{'content_type': 'functions_classes',\n",
" 'language': <Language.PYTHON: 'python'>,\n",
" 'source': 'example_data/source_code/example.py'}\n",
"{'content_type': 'simplified_code',\n",
" 'language': <Language.PYTHON: 'python'>,\n",
" 'source': 'example_data/source_code/example.py'}\n",
"{'content_type': 'functions_classes',\n",
" 'language': <Language.JS: 'js'>,\n",
" 'source': 'example_data/source_code/example.js'}\n",
"{'content_type': 'functions_classes',\n",
" 'language': <Language.JS: 'js'>,\n",
" 'source': 'example_data/source_code/example.js'}\n",
"{'content_type': 'simplified_code',\n",
" 'language': <Language.JS: 'js'>,\n",
" 'source': 'example_data/source_code/example.js'}\n"
]
}
],
"source": [
"for document in docs:\n",
" pprint(document.metadata)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f44e3e37",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"class MyClass:\n",
" def __init__(self, name):\n",
" self.name = name\n",
"\n",
" def greet(self):\n",
" print(f\"Hello, {self.name}!\")\n",
"\n",
"--8<--\n",
"\n",
"def main():\n",
" name = input(\"Enter your name: \")\n",
" obj = MyClass(name)\n",
" obj.greet()\n",
"\n",
"--8<--\n",
"\n",
"# Code for: class MyClass:\n",
"\n",
"\n",
"# Code for: def main():\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()\n",
"\n",
"--8<--\n",
"\n",
"class MyClass {\n",
" constructor(name) {\n",
" this.name = name;\n",
" }\n",
"\n",
" greet() {\n",
" console.log(`Hello, ${this.name}!`);\n",
" }\n",
"}\n",
"\n",
"--8<--\n",
"\n",
"function main() {\n",
" const name = prompt(\"Enter your name:\");\n",
" const obj = new MyClass(name);\n",
" obj.greet();\n",
"}\n",
"\n",
"--8<--\n",
"\n",
"// Code for: class MyClass {\n",
"\n",
"// Code for: function main() {\n",
"\n",
"main();\n"
]
}
],
"source": [
"print(\"\\n\\n--8<--\\n\\n\".join([document.page_content for document in docs]))"
]
},
{
"cell_type": "markdown",
"id": "69aad0ed",
"metadata": {},
"source": [
"The parser can be disabled for small files. \n",
"\n",
"The parameter `parser_threshold` indicates the minimum number of lines that the source code file must have to be segmented using the parser."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ae024794",
"metadata": {},
"outputs": [],
"source": [
"loader = GenericLoader.from_filesystem(\n",
" \"./example_data/source_code\",\n",
" glob=\"*\",\n",
" suffixes=[\".py\"],\n",
" parser=LanguageParser(language=Language.PYTHON, parser_threshold=1000),\n",
")\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5d3b372a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "89e546ad",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"class MyClass:\n",
" def __init__(self, name):\n",
" self.name = name\n",
"\n",
" def greet(self):\n",
" print(f\"Hello, {self.name}!\")\n",
"\n",
"\n",
"def main():\n",
" name = input(\"Enter your name: \")\n",
" obj = MyClass(name)\n",
" obj.greet()\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" main()\n",
"\n"
]
}
],
"source": [
"print(docs[0].page_content)"
]
},
{
"cell_type": "markdown",
"id": "c9c71e61",
"metadata": {},
"source": [
"## Splitting\n",
"\n",
"Additional splitting could be needed for those functions, classes, or scripts that are too big."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "adbaa79f",
"metadata": {},
"outputs": [],
"source": [
"loader = GenericLoader.from_filesystem(\n",
" \"./example_data/source_code\",\n",
" glob=\"*\",\n",
" suffixes=[\".js\"],\n",
" parser=LanguageParser(language=Language.JS),\n",
")\n",
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "c44c0d3f",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import (\n",
" RecursiveCharacterTextSplitter,\n",
" Language,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b1e0053d",
"metadata": {},
"outputs": [],
"source": [
"js_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" language=Language.JS, chunk_size=60, chunk_overlap=0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7dbe6188",
"metadata": {},
"outputs": [],
"source": [
"result = js_splitter.split_documents(docs)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "8a80d089",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(result)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "000a6011",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"class MyClass {\n",
" constructor(name) {\n",
" this.name = name;\n",
"\n",
"--8<--\n",
"\n",
"}\n",
"\n",
"--8<--\n",
"\n",
"greet() {\n",
" console.log(`Hello, ${this.name}!`);\n",
" }\n",
"}\n",
"\n",
"--8<--\n",
"\n",
"function main() {\n",
" const name = prompt(\"Enter your name:\");\n",
"\n",
"--8<--\n",
"\n",
"const obj = new MyClass(name);\n",
" obj.greet();\n",
"}\n",
"\n",
"--8<--\n",
"\n",
"// Code for: class MyClass {\n",
"\n",
"// Code for: function main() {\n",
"\n",
"--8<--\n",
"\n",
"main();\n"
]
}
],
"source": [
"print(\"\\n\\n--8<--\\n\\n\".join([document.page_content for document in result]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}