{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Code search\n",
    "\n",
    "We index our own [openai-python code repository](https://github.com/openai/openai-python), and show how it can be searched. We implement a simple version of file parsing and extracting of functions from python files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of py files: 51\n",
      "Total number of functions extracted: 97\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from glob import glob\n",
    "import pandas as pd\n",
    "\n",
    "def get_function_name(code):\n",
    "    \"\"\"\n",
    "    Extract function name from a line beginning with \"def \"\n",
    "\n",
    "    `code` may be the full text of a function; only the characters between\n",
    "    the leading \"def \" and the first \"(\" are returned.\n",
    "    \"\"\"\n",
    "    assert code.startswith(\"def \")\n",
    "    return code[len(\"def \"): code.index(\"(\")]\n",
    "\n",
    "def get_until_no_space(all_lines, i) -> str:\n",
    "    \"\"\"\n",
    "    Get all lines until a line outside the function definition is found.\n",
    "\n",
    "    Starting after the line at index `i`, lines are kept while they are empty\n",
    "    or start with a space, a tab or \")\"; the first other line ends the scan.\n",
    "    The kept lines, including line `i`, are returned joined by newlines.\n",
    "    \"\"\"\n",
    "    ret = [all_lines[i]]\n",
    "    # Scan to the real end of the file instead of a fixed 10000-line window,\n",
    "    # so long functions are not truncated and no iterations run past the end.\n",
    "    for j in range(i + 1, len(all_lines)):\n",
    "        line = all_lines[j]\n",
    "        if len(line) == 0 or line[0] in [\" \", \"\\t\", \")\"]:\n",
    "            ret.append(line)\n",
    "        else:\n",
    "            break\n",
    "    return \"\\n\".join(ret)\n",
    "\n",
    "def get_functions(filepath):\n",
    "    \"\"\"\n",
    "    Get all functions in a Python file.\n",
    "\n",
    "    Yields one dict per line that starts with \"def \" at column 0, with the\n",
    "    keys \"code\", \"function_name\" and \"filepath\".\n",
    "    \"\"\"\n",
    "    # `with` closes the file handle as soon as the text is read. Text mode\n",
    "    # (default newline handling) already turns CRLF / CR line endings into LF,\n",
    "    # so splitting on \"\\n\" is enough.\n",
    "    with open(filepath, encoding=\"utf-8\") as source_file:\n",
    "        whole_code = source_file.read()\n",
    "    all_lines = whole_code.split(\"\\n\")\n",
    "    for i, line in enumerate(all_lines):\n",
    "        if line.startswith(\"def \"):\n",
    "            code = get_until_no_space(all_lines, i)\n",
    "            function_name = get_function_name(code)\n",
    "            yield {\"code\": code, \"function_name\": function_name, \"filepath\": filepath}\n",
    "\n",
    "\n",
    "# get user home directory\n",
    "root_dir = os.path.expanduser(\"~\")\n",
    "# note: for this code to work, the openai-python repo must be downloaded and placed in your home directory\n",
    "\n",
    "# path to code repository directory (os.path.join keeps the separator portable)\n",
    "code_root = os.path.join(root_dir, \"openai-python\")\n",
    "\n",
    "# collect every .py file below code_root, directory by directory\n",
    "code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], '*.py'))]\n",
    "print(\"Total number of py files:\", len(code_files))\n",
    "\n",
    "if len(code_files) == 0:\n",
    "    print(\"Double check that you have downloaded the openai-python repo and set the code_root variable correctly.\")\n",
    "\n",
    "all_funcs = []\n",
    "for code_file in code_files:\n",
    "    all_funcs.extend(get_functions(code_file))\n",
    "\n",
    "print(\"Total number of functions extracted:\", len(all_funcs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | code | \n", "function_name | \n", "filepath | \n", "code_embedding | \n", "
---|---|---|---|---|
0 | \n", "def _console_log_level():\\n if openai.log i... | \n", "_console_log_level | \n", "/openai/util.py | \n", "[0.03389773145318031, -0.004390408284962177, 0... | \n", "
1 | \n", "def log_debug(message, **params):\\n msg = l... | \n", "log_debug | \n", "/openai/util.py | \n", "[-0.004034275189042091, 0.004895383026450872, ... | \n", "
2 | \n", "def log_info(message, **params):\\n msg = lo... | \n", "log_info | \n", "/openai/util.py | \n", "[0.004882764536887407, 0.0033515947870910168, ... | \n", "
3 | \n", "def log_warn(message, **params):\\n msg = lo... | \n", "log_warn | \n", "/openai/util.py | \n", "[0.002535992069169879, -0.010829543694853783, ... | \n", "
4 | \n", "def logfmt(props):\\n def fmt(key, val):\\n ... | \n", "logfmt | \n", "/openai/util.py | \n", "[0.016732551157474518, 0.017367802560329437, 0... | \n", "