# %% [markdown]
# ## Code search
#
# We index our own openai-python code repository and show how it can be searched.
# We implement a simple version of file parsing and function extraction for Python files.

# %%
import os
from glob import glob
import pandas as pd


def get_function_name(code):
    """
    Extract the function name from source text beginning with "def ".

    Args:
        code: Source text whose first characters are "def ", e.g. the value
            returned by ``get_until_no_space``.

    Returns:
        The text between "def " and the first "(", with surrounding
        whitespace removed (so ``def foo (x)`` yields ``"foo"``).

    Raises:
        AssertionError: if ``code`` does not start with "def ".
        ValueError: if ``code`` contains no "(".
    """
    assert code.startswith("def ")
    return code[len("def "): code.index("(")].strip()


def get_until_no_space(all_lines, i) -> str:
    """
    Collect the function definition that starts at ``all_lines[i]``.

    Lines following index ``i`` are kept while they are empty or start with a
    space, a tab, or ")" (the latter covers the closing parenthesis of a
    multi-line signature written at column 0). The first line that matches
    none of these ends the function. Blank lines trailing the body are
    therefore included in the result.

    Args:
        all_lines: The file content as a list of lines without line endings.
        i: Index of the line on which the definition starts.

    Returns:
        The collected lines joined with "\n".
    """
    ret = [all_lines[i]]
    # Scan to the real end of the file instead of a fixed 10000-line window,
    # so arbitrarily long functions are not truncated.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)


def get_functions(filepath):
    """
    Yield every top-level function found in a Python file.

    Only lines that start with "def " at column 0 are treated as function
    definitions; indented definitions (methods, nested functions) and
    ``async def`` are not matched by this simple parser.

    Args:
        filepath: Path of the Python source file to scan.

    Yields:
        A dict with the keys "code" (the function source text),
        "function_name" and "filepath" (the ``filepath`` argument as given).
    """
    # Read everything up front inside a context manager so the file handle is
    # closed before the first item is yielded. Python source defaults to UTF-8.
    with open(filepath, encoding="utf-8") as source_file:
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, line in enumerate(all_lines):
        if line.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}


# get user root directory
root_dir = os.path.expanduser("~")

# path to code repository directory
code_root = os.path.join(root_dir, "openai-python")

# every *.py file in every directory below code_root
code_files = [
    py_path
    for dirpath, _dirnames, _filenames in os.walk(code_root)
    for py_path in glob(os.path.join(dirpath, "*.py"))
]
print("Total number of py files:", len(code_files))

all_funcs = []
for code_file in code_files:
    all_funcs.extend(get_functions(code_file))

print("Total number of functions extracted:", len(all_funcs))

# %% [markdown]
# For code search models we use code-search-{model}-code to obtain embeddings for code snippets, and code-search-{model}-text to embed natural language queries.
\n", " | code | \n", "function_name | \n", "filepath | \n", "code_embedding | \n", "
---|---|---|---|---|
0 | \n", "def semantic_search(engine, query, documents):... | \n", "semantic_search | \n", "/examples/semanticsearch/semanticsearch.py | \n", "[-0.038976121693849564, -0.0031428150832653046... | \n", "
1 | \n", "def main():\\n parser = argparse.ArgumentPar... | \n", "main | \n", "/examples/semanticsearch/semanticsearch.py | \n", "[-0.024289356544613838, -0.017748363316059113,... | \n", "
2 | \n", "def get_candidates(\\n prompt: str,\\n sto... | \n", "get_candidates | \n", "/examples/codex/backtranslation.py | \n", "[-0.04161201789975166, -0.0169310811907053, 0.... | \n", "
3 | \n", "def rindex(lst: List, value: str) -> int:\\n ... | \n", "rindex | \n", "/examples/codex/backtranslation.py | \n", "[-0.027255680412054062, -0.007931121625006199,... | \n", "
4 | \n", "def eval_candidate(\\n candidate_answer: str... | \n", "eval_candidate | \n", "/examples/codex/backtranslation.py | \n", "[-0.00999179296195507, -0.01640152558684349, 0... | \n", "