openai-cookbook/solutions/web_crawl_Q&A/web-qa.ipynb

1286 lines
80 KiB
Plaintext
Raw Normal View History

2023-02-03 21:42:19 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://openai.com/\n",
"https://openai.com/blog/tags/announcements\n",
"https://openai.com/blog/introducing-openai\n",
"https://openai.com/blog/authors/ilya\n",
"https://openai.com/blog/requests-for-research-2\n",
"https://openai.com/blog/authors/diederik\n",
"https://openai.com/blog/block-sparse-gpu-kernels\n",
"https://openai.com/blog/authors/alec\n",
"https://openai.com/blog/fine-tuning-gpt-2\n",
"https://openai.com/blog/authors/paul\n",
"https://openai.com/blog/concrete-ai-safety-problems\n",
"https://openai.com/blog/learning-to-summarize-with-human-feedback\n",
"https://openai.com/blog/authors/long\n",
"https://openai.com/blog/authors/lowe\n",
"https://openai.com/blog/learning-to-cooperate-compete-and-communicate\n",
"https://openai.com/blog/authors/jean\n",
"https://openai.com/blog/authors/igor\n",
"https://openai.com/blog/neural-mmo\n",
"https://openai.com/blog/authors/phillip\n",
"https://openai.com/blog/evolved-policy-gradients\n",
"https://openai.com/blog/authors/richard\n",
"https://openai.com/blog/better-exploration-with-parameter-noise\n",
"https://openai.com/blog/authors/xi\n",
"https://openai.com/blog/authors/matthias\n",
"https://openai.com/blog/solving-rubiks-cube\n",
"https://openai.com/blog/authors/ilge\n",
"https://openai.com/blog/vpt\n",
"https://openai.com/blog/authors/brandon\n",
"https://openai.com/blog/authors/raul\n",
"https://openai.com/blog/authors/bowen\n",
"https://openai.com/blog/authors/jie\n",
"https://openai.com/blog/tags/five\n",
"https://openai.com/blog/openai-five-benchmark-results\n",
"https://openai.com/blog/openai-five/#rapid\n",
"https://openai.com/blog/authors/henrique\n",
"https://openai.com/blog/authors/susan\n",
"https://openai.com/blog/authors/brooke\n",
"https://openai.com/blog/authors/michael-petrov\n",
"https://openai.com/blog/multimodal-neurons\n",
"https://openai.com/blog/authors/shan\n",
"https://openai.com/blog/authors/daniela\n",
"https://openai.com/blog/authors/nick\n",
"https://openai.com/blog/authors/chris\n",
"https://openai.com/blog/introducing-activation-atlases\n",
"https://openai.com/blog/authors/ludwig-schubert\n",
"https://openai.com/blog/authors/justin\n",
"https://openai.com/blog/authors/gabriel\n",
"https://openai.com/blog/microscope\n",
"https://openai.com/blog/authors/przemyslaw\n",
"https://openai.com/blog/authors/david\n",
"https://openai.com/blog/authors/jakub-pachocki\n",
"https://openai.com/blog/authors/christy\n",
"https://openai.com/blog/improving-language-model-behavior\n",
"https://openai.com/blog/authors/irene\n",
"https://openai.com/blog/gpt-2-1-5b-release\n",
"https://openai.com/blog/authors/jack-clark\n",
"https://openai.com/blog/cooperation-on-safety\n",
"https://openai.com/blog/authors/amanda\n",
"https://openai.com/blog/ai-safety-needs-social-scientists\n",
"https://openai.com/blog/adversarial-example-research\n",
"https://openai.com/blog/authors/sandy\n",
"https://openai.com/blog/authors/ian\n",
"https://openai.com/blog/machine-learning-unconference\n",
"https://openai.com/events/code-of-conduct.txt\n",
"https://openai.com/blog/authors/rocky\n",
"https://openai.com/blog/authors/nicolas\n",
"https://openai.com/blog/preparing-for-malicious-uses-of-ai\n",
"https://openai.com/blog/authors/michael\n",
"https://openai.com/blog/spam-detection-in-the-physical-world\n",
"https://openai.com/blog/authors/rachel\n",
"https://openai.com/blog/authors/alex-ray\n",
"https://openai.com/blog/generalizing-from-simulation\n",
"https://openai.com/blog/authors/lerrel\n",
"https://openai.com/blog/authors/xue\n",
"https://openai.com/blog/faster-robot-simulation-in-python\n",
"https://openai.com/blog/safety-gym\n",
"https://openai.com/blog/authors/joshua\n",
"https://openai.com/blog/spinning-up-in-deep-rl\n",
"https://openai.com/blog/spinning-up-in-deep-rl-workshop-review\n",
"https://openai.com/blog/hackathon-follow-up\n",
"https://openai.com/blog/authors/parnian\n",
"https://openai.com/blog/openai-hackathon\n",
"https://openai.com/events/hackathon.txt\n",
"https://openai.com/blog/authors/josh-tobin\n",
"https://openai.com/blog/report-from-the-self-organizing-conference\n",
"https://openai.com/blog/faulty-reward-functions\n",
"https://openai.com/blog/authors/miles\n",
"https://openai.com/blog/language-model-safety-and-misuse\n",
"https://openai.com/blog/authors/tyna\n",
"https://openai.com/blog/webgpt\n",
"https://openai.com/blog/authors/jacob-hilton\n",
"https://openai.com/blog/measuring-goodharts-law\n",
"https://openai.com/careers/research-engineer\n",
"https://openai.com/blog/authors/leo\n",
"https://openai.com/blog/learning-to-summarize-with-human-feedback/#optimizingtherewardmodel\n",
"https://openai.com/blog/procgen-benchmark\n",
"https://openai.com/blog/first-retro-contest-retrospective\n",
"https://openai.com/blog/authors/oleg\n",
"https://openai.com/blog/roboschool\n",
"https://openai.com/blog/gym-retro\n",
"https://openai.com/blog/authors/vicki\n",
"https://openai.com/blog/retro-contest\n",
"https://openai.com/blog/authors/alex\n",
"https://openai.com/blog/reptile\n",
"https://openai.com/blog/dall-e-2-pre-training-mitigations\n",
"https://openai.com/blog/authors/larissa\n",
"https://openai.com/blog/openai-scholars-2018-final-projects\n",
"https://openai.com/blog/authors/karl\n",
"https://openai.com/blog/grade-school-math\n",
"https://openai.com/blog/authors/vineet\n",
"https://openai.com/blog/authors/christopher\n",
"https://openai.com/blog/quantifying-generalization-in-reinforcement-learning\n",
"https://openai.com/blog/authors/reiichiro\n",
"https://openai.com/blog/authors/suchir\n",
"https://openai.com/blog/authors/katie\n",
"https://openai.com/blog/authors/sandhini\n",
"https://openai.com/blog/authors/pamela\n",
"https://openai.com/blog/authors/steven\n",
"https://openai.com/blog/authors/gretchen\n",
"https://openai.com/blog/authors/jan\n",
"https://openai.com/blog/critiques\n",
"https://openai.com/blog/authors/william-saunders\n",
"https://openai.com/blog/authors/catherine\n",
"https://openai.com/blog/our-approach-to-alignment-research\n",
"https://openai.com/blog/best-practices-for-deploying-language-models\n",
"https://openai.com/blog/instruction-following/#limitations\n",
"https://openai.com/blog/economic-impacts\n",
"https://openai.com/blog/authors/sam-manning\n",
"https://openai.com/scholars\n",
"https://openai.com/blog/openai-scholars-2021-final-projects\n",
"https://openai.com/blog/openai-scholars-2020-final-projects\n",
"https://openai.com/resources\n",
"https://openai.com/blog/openai-scholars-spring-2020\n",
"https://openai.com/blog/openai-scholars-class-of-19\n",
"https://openai.com/blog/openai-scholars-2019-final-projects\n",
"https://openai.com/blog/authors/jonathan\n",
"https://openai.com/blog/discovering-types-for-entity-disambiguation\n",
"https://openai.com/blog/openai-five-benchmark\n",
"https://openai.com/blog/openai-five-defeats-dota-2-world-champions/#arena\n",
"https://openai.com/blog/openai-five/#ourapproach\n",
"https://openai.com/blog/more-on-dota-2/#botexploits\n",
"https://openai.com/blog/openai-five-benchmark-results/#training\n",
"https://openai.com/blog/openai-five-finals\n",
"https://openai.com/five/#overview\n",
"https://openai.com/blog/dota-2\n",
"https://openai.com/the-international\n",
"https://openai.com/blog/more-on-dota-2\n",
"https://openai.com/blog/the-international-2018-results\n",
"https://openai.com/blog/openai-five-defeats-dota-2-world-champions/#cooperativemode\n",
"https://openai.com/blog/openai-five-defeats-dota-2-world-champions\n",
"https://openai.com/blog/authors/jeff\n",
"https://openai.com/blog/authors/adrien\n",
"https://openai.com/blog/authors/joost\n",
"https://openai.com/blog/authors/peter-zhokhov\n",
"https://openai.com/blog/authors/glenn\n",
"https://openai.com/blog/authors/peter\n",
"https://openai.com/blog/authors/raphael\n",
"https://openai.com/blog/authors/lilian\n",
"https://openai.com/blog/techniques-for-training-large-neural-networks\n",
"https://openai.com/blog/authors/alex-paino\n",
"https://openai.com/blog/authors/nikolas\n",
"https://openai.com/blog/openai-five\n",
"https://openai.com/blog/authors/bob\n",
"https://openai.com/blog/authors/qiming\n",
"https://openai.com/blog/authors/wojciech\n",
"https://openai.com/blog/authors/arthur\n",
"https://openai.com/blog/authors/mateusz\n",
"https://openai.com/blog/authors/maciek\n",
"https://openai.com/blog/authors/jerry\n",
"https://openai.com/blog/authors/lei\n",
"https://openai.com/blog/how-to-train-your-openai-five\n",
"https://openai.com/blog/authors/jonas-schneider\n",
"https://openai.com/jobs/#robotics\n",
"https://openai.com/interview-guide\n",
"https://openai.com/blog/learning-dexterity\n",
"https://openai.com/blog/authors/rafal\n",
"https://openai.com/blog/ingredients-for-robotics-research\n",
"https://openai.com/blog/authors/vikash\n",
"https://openai.com/blog/authors/marcin\n",
"https://openai.com/blog/authors/prafulla\n",
"https://openai.com/blog/authors/szymon-sidor\n",
"https://openai.com/blog/openai-baselines-dqn\n",
"https://openai.com/blog/authors/tamim\n",
"https://openai.com/blog/learning-montezumas-revenge-from-a-single-demonstration\n",
"https://openai.com/blog/authors/bradly\n",
"https://openai.com/blog/authors/rein\n",
"https://openai.com/blog/authors/jonathan-ho\n",
"https://openai.com/blog/learning-a-hierarchy\n",
"https://openai.com/blog/authors/peter-chen\n",
"https://openai.com/blog/authors/kevin\n",
"https://openai.com/blog/authors/filip\n",
"https://openai.com/five\n",
"https://openai.com/blog/authors/yilun\n",
"https://openai.com/blog/authors/joseph\n",
"https://openai.com/blog/interpretable-machine-learning-through-teaching\n",
"https://openai.com/blog/authors/smitha\n",
"https://openai.com/blog/learning-to-model-other-minds\n",
"https://openai.com/blog/authors/shimon\n",
"https://openai.com/blog/authors/maruan\n",
"https://openai.com/blog/authors/jakob-foerster\n",
"https://openai.com/blog/nonlinear-computation-in-linear-networks\n",
"https://openai.com/blog/energy-based-models\n",
"https://openai.com/blog/emergent-tool-use\n",
"https://openai.com/blog/authors/ingmar\n",
"https://openai.com/blog/authors/todor\n",
"https://openai.com/blog/learning-concepts-with-energy-functions\n",
"https://openai.com/blog/authors/yi\n",
"https://openai.com/blog/authors/pieter\n",
"https://openai.com/blog/authors/aviv\n",
"https://openai.com/blog/instruction-following\n",
"https://openai.com/blog/learning-to-communicate\n",
"https://openai.com/blog/authors/jon\n",
"https://openai.com/blog/summarizing-books\n",
"https://openai.com/blog/authors/chelsea\n",
"https://openai.com/blog/gathering_human_feedback\n",
"https://openai.com/blog/authors/dario-amodei\n",
"https://openai.com/blog/science-of-ai\n",
"https://openai.com/blog/authors/jared\n",
"https://openai.com/blog/authors/sam\n",
"https://openai.com/blog/gpt-2-6-month-follow-up\n",
"https://openai.com/blog/better-language-models/#update\n",
"https://openai.com/blog/authors/david-luan\n",
"https://openai.com/blog/authors/danny\n",
"https://openai.com/blog/ai-and-efficiency\n",
"https://openai.com/blog/authors/david-lansky\n",
"https://openai.com/blog/authors/tom\n",
"https://openai.com/blog/testing-robustness\n",
"https://openai.com/blog/authors/jacob\n",
"https://openai.com/blog/authors/yi-sun\n",
"https://openai.com/blog/authors/daniel\n",
"https://openai.com/blog/authors/dan\n",
"https://openai.com/blog/deep-reinforcement-learning-from-human-preferences\n",
"https://openai.com/blog/authors/geoffrey\n",
"https://openai.com/blog/debate\n",
"https://openai.com/blog/authors/jeffrey\n",
"https://openai.com/blog/authors/nisan\n",
"https://openai.com/blog/amplifying-ai-training\n",
"https://openai.com/blog/authors/daniel-ziegler\n",
"https://openai.com/blog/baselines-acktr-a2c\n",
"https://openai.com/blog/authors/yuhuai\n",
"https://openai.com/blog/authors/shun\n",
"https://openai.com/blog/authors/elman\n",
"https://openai.com/blog/openai-baselines-ppo\n",
"https://openai.com/blog/language-unsupervised\n",
"https://openai.com/blog/tags/baselines\n",
"https://openai.com/blog/authors/scott\n",
"https://openai.com/blog/sparse-transformer\n",
"https://openai.com/blog/authors/rewon\n",
"https://openai.com/blog/glow\n",
"https://openai.com/blog/authors/john\n",
"https://openai.com/blog/openai-gym-beta\n",
"https://openai.com/blog/authors/tim\n",
"https://openai.com/jobs\n",
"https://openai.com/blog/formal-math\n",
"https://openai.com/blog/authors/stanislas\n",
"https://openai.com/blog/authors/jesse\n",
"https://openai.com/blog/generative-models\n",
"https://openai.com/blog/authors/andrej\n",
"https://openai.com/blog/distill\n",
"https://openai.com/blog/authors/vicki-cheung\n",
"https://openai.com/blog/jukebox\n",
"https://openai.com/projects/five\n",
"https://openai.com/blog/authors/christine\n",
"https://openai.com/blog/authors/jong\n",
"https://openai.com/blog/authors/heewoo\n",
"https://openai.com/blog/musenet\n",
"https://openai.com/blog/better-language-models\n",
"https://openai.com/blog/robots-that-learn\n",
"https://openai.com/blog/authors/ankur\n",
"https://openai.com/blog/authors/erika-reinhardt\n",
"https://openai.com/blog/deep-double-descent\n",
"https://openai.com/blog/authors/tristan\n",
"https://openai.com/blog/authors/preetum\n",
"https://openai.com/blog/authors/boaz\n",
"https://openai.com/blog/authors/yamini\n",
"https://openai.com/blog/authors/gal\n",
"https://openai.com/blog/tags/gpt-2\n",
"https://openai.com/blog/clip\n",
"https://openai.com/blog/ai-and-compute\n",
"https://openai.com/blog/authors/girish\n",
"https://openai.com/blog/special-projects\n",
"https://openai.com/blog/authors/sam-altman\n",
"https://openai.com/blog/unsupervised-sentiment-neuron\n",
"https://openai.com/blog/dall-e\n",
"https://openai.com/blog/authors/aditya\n",
"https://openai.com/blog/authors/mark\n",
"https://openai.com/blog/authors/mikhail\n",
"https://openai.com/blog/authors/vedant\n",
"https://openai.com/blog/competitive-self-play\n",
"https://openai.com/blog/authors/trapit\n",
"https://openai.com/blog/meta-learning-for-wrestling\n",
"https://openai.com/blog/authors/yura\n",
"https://openai.com/blog/reinforcement-learning-with-prediction-based-rewards\n",
"https://openai.com/blog/authors/harri\n",
"https://openai.com/blog/image-gpt\n",
"https://openai.com/blog/evolution-strategies\n",
"https://openai.com/blog/infrastructure-for-deep-learning\n",
"https://openai.com/blog/generative-models/#gan\n",
"https://openai.com/blog/generative-models#improving-gans\n",
"https://openai.com/blog/tags/multimodal\n",
"https://openai.com/gpt-3\n",
"https://openai.com/javascript:setMathjaxCookie()\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/abs/2005.14165v1\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/list/cs.CL/new\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/abs/2005.14165v3\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/auth/show-endorsers/2005.14165\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/list/cs/recent\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/abs/2005.14165?context=cs\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/{url_path('ignore_me')}\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/abs/2005.14165v2\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/show-email/b5cb66e9/2005.14165\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/prevnext?id=2005.14165&function=next&context=cs.CL\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/format/2005.14165\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/prevnext?id=2005.14165&function=prev&context=cs.CL\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/pdf/2005.14165\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/tb/2005.14165\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/list/cs.CL/2005\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/list/cs.CL/recent\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/blog/dall-e-2\n",
"https://openai.com/blog/authors/openai\n",
"https://openai.com/blog/improving-verifiability\n",
"https://openai.com/blog/dall-e-2-extending-creativity\n",
"https://openai.com/blog/the-international\n",
"https://openai.com/blog/symposium-2019\n",
"https://openai.com/blog/tags/culture\n",
"https://openai.com/blog/learning-day\n",
"https://openai.com/blog/openai-fellows-fall-2018\n",
"https://openai.com/blog/neurips-2020\n",
"https://openai.com/blog/tags/community\n",
"https://openai.com/blog/universe\n",
"https://openai.com/blog/openai-gym-beta/#rl\n",
"https://openai.com/blog/openai-technical-goals/#goal4\n",
"https://openai.com/blog/authors/elon\n",
"https://openai.com/blog/scaling-kubernetes-to-7500-nodes\n",
"https://openai.com/blog/scaling-kubernetes-to-2500-nodes\n",
"https://openai.com/blog/authors/christopher-berner\n",
"https://openai.com/blog/authors/bchess\n",
"https://openai.com/blog/authors/eric\n",
"https://openai.com/blog/forecasting-misuse\n",
"https://openai.com/forecasting-misuse-paper\n",
"https://openai.com/prevnext?id=2301.04246&function=prev&context=cs.CY\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/auth/show-endorsers/2301.04246\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/format/2301.04246\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/pdf/2301.04246\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/show-email/64c5c6bd/2301.04246\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/list/cs.CY/recent\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/prevnext?id=2301.04246&function=next&context=cs.CY\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/list/cs.CY/new\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/list/cs.CY/2301\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/abs/2301.04246?context=cs\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/blog/authors/greg\n",
"https://openai.com/blog/dall-e-api-now-available-in-public-beta\n",
"https://openai.com/blog/api-no-waitlist\n",
"https://openai.com/blog/dall-e-introducing-outpainting\n",
"https://openai.com/blog/team-update\n",
"https://openai.com/blog/chatgpt-plus\n",
"https://openai.com/blog/openai-api\n",
"https://openai.com/jobs/#applied-ai\n",
"https://openai.com/blog/authors/mira\n",
"https://openai.com/join\n",
"Unable to parse page https://openai.com/join due to JavaScript being required\n",
"HTTP Error 403: Forbidden\n",
"https://openai.com/blog/tags/residency\n",
"https://openai.com/blog/openai-licenses-gpt-3-technology-to-microsoft\n",
"https://openai.com/blog/microsoft\n",
"https://openai.com/blog/team-update-august\n",
"https://openai.com/blog/new-ai-classifier-for-indicating-ai-written-text\n",
"https://openai.com/blog/authors/lama\n",
"https://openai.com/blog/authors/scott-aaronson\n",
"https://openai.com/blog/authors/jan-hendrik-kirchner\n",
"https://openai.com/blog/tags/api\n",
"https://openai.com/blog/openai-fellows\n",
"https://openai.com/blog/tags/scholars\n",
"https://openai.com/blog/openai-and-microsoft-extend-partnership\n",
"https://openai.com/blog/dall-e-now-available-without-waitlist\n",
"https://openai.com/blog/helen-toner-joins\n",
"https://openai.com/blog/team-update-january\n",
"https://openai.com/blog/team-plus-plus#interns\n",
"https://openai.com/blog/openai-codex\n",
"https://openai.com/blog/openai-scholars-2019\n",
"https://openai.com/blog/authors/ashley\n",
"https://openai.com/blog/openai-scholars\n",
"https://openai.com/blog/dall-e-now-available-in-beta\n",
"https://openai.com/blog/new-and-improved-embedding-model\n",
"https://openai.com/blog/authors/ryan\n",
"https://openai.com/blog/authors/arvind\n",
"https://openai.com/blog/authors/ted\n",
"https://openai.com/blog/dall-e-2-update\n",
"https://openai.com/blog/authors/joanne\n",
"https://openai.com/blog/tags/fellows\n",
"https://openai.com/blog/openai-summer-fellows-2018\n",
"https://openai.com/blog/authors/maddie\n",
"https://openai.com/blog/codex-apps\n",
"https://openai.com/blog/codex\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/blog/new-and-improved-content-moderation-tooling\n",
"https://openai.com/blog/authors/teddy\n",
"https://openai.com/blog/authors/angela\n",
"https://openai.com/blog/authors/chong\n",
"https://openai.com/blog/welcome-pieter-and-shivon\n",
"https://openai.com/blog/openai-technical-goals\n",
"https://openai.com/blog/procgen-minerl-competitions\n",
"https://openai.com/blog/will-hurd-joins\n",
"https://openai.com/blog/fund\n",
"https://openai.com/news\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/news/introducing-our-first-investments\n",
"HTTP Error 404: Not Found\n",
"https://openai.com/blog/introducing-text-and-code-embeddings\n",
"https://openai.com/blog/authors/boris\n",
"https://openai.com/blog/openai-scholars-2018-meet-our-scholars\n",
"https://openai.com/blog/team-plus-plus\n",
"https://openai.com/blog/gpt-3-apps\n",
"https://openai.com/jobs/#open\n",
"https://openai.com/blog/customized-gpt-3\n",
"https://openai.com/blog/authors/luke\n",
"https://openai.com/blog/authors/rachel-lim\n",
"https://openai.com/blog/authors/michael-wu\n",
"https://openai.com/blog/openai-supporters\n",
"https://openai.com/blog/openai-residency\n",
"https://openai.com/blog/leadership-team-update\n",
"https://openai.com/blog/organizational-update\n",
"https://openai.com/blog/openai-fellows-interns-2019\n",
"https://openai.com/blog/openai-scholars-2020\n",
"https://openai.com/blog/gpt-3-edit-insert\n",
"https://openai.com/blog/authors/mo\n",
"https://openai.com/blog/openai-pytorch\n",
"https://openai.com/blog/openai-scholars-2019-meet-our-scholars\n",
"https://openai.com/blog/openai-charter\n",
"https://openai.com/blog/openai-and-microsoft\n",
"https://openai.com/blog/openai-lp\n",
"https://openai.com/blog/reducing-bias-and-improving-safety-in-dall-e-2\n",
"https://openai.com/terms\n",
"https://openai.com/api/policies/service-terms\n",
"https://openai.com/api/policies/sharing-publication\n",
"https://openai.com/api/policies/terms\n",
"https://openai.com/security/disclosure\n",
"https://openai.com/blog/whisper\n",
"https://openai.com/blog/authors/tao\n",
"https://openai.com/research\n",
"https://openai.com/api/docs\n",
"Unable to parse page https://openai.com/api/docs due to JavaScript being required\n",
"HTTP Error 403: Forbidden\n",
"https://openai.com/dall-e-2\n",
"https://openai.com/privacy\n",
"https://openai.com/api\n",
"https://openai.com/blog\n",
"https://openai.com/blog/triton\n",
"https://openai.com/blog/authors/philippe\n",
"https://openai.com/jobs/#acceleration\n",
"https://openai.com/blog/robust-adversarial-inputs\n",
"https://openai.com/blog/authors/anish-athalye\n",
"https://openai.com/blog/tags/milestones\n",
"https://openai.com/alignment\n",
"https://openai.com\n",
"https://openai.com/publications\n",
"https://openai.com/charter\n",
"https://openai.com/blog/tags/research\n",
"https://openai.com/fund\n",
"https://openai.com/about\n",
"https://openai.com/timeline\n",
"https://openai.com/careers\n",
"https://openai.com/api/examples\n",
"Unable to parse page https://openai.com/api/examples due to JavaScript being required\n",
"HTTP Error 403: Forbidden\n",
"https://openai.com/api/login\n",
"Unable to parse page https://openai.com/api/login due to JavaScript being required\n",
"HTTP Error 403: Forbidden\n",
"https://openai.com/newsroom\n",
"https://openai.com/api/policies\n",
"https://openai.com/api/pricing\n",
"https://openai.com/contact-sales\n",
"https://openai.com/api/pricing/#faq-fine-tuning-pricing-calculation\n",
"https://openai.com/blog/tags/events\n",
"https://openai.com/blog/chatgpt\n"
]
}
],
"source": [
"import requests\n",
"import re\n",
"import urllib.request\n",
"from bs4 import BeautifulSoup\n",
"from collections import deque\n",
"from html.parser import HTMLParser\n",
"from urllib.parse import urlparse\n",
"import os\n",
"\n",
"# Regex pattern to match a URL\n",
"HTTP_URL_PATTERN = r'^http[s]?://.+'\n",
"\n",
"# Define root domain to crawl\n",
"domain = \"openai.com\"\n",
"full_url = \"https://openai.com/\"\n",
"\n",
"# Create a class to parse the HTML and get the hyperlinks\n",
"class HyperlinkParser(HTMLParser):\n",
" def __init__(self):\n",
" super().__init__()\n",
" # Create a list to store the hyperlinks\n",
" self.hyperlinks = []\n",
"\n",
" # Override the HTMLParser's handle_starttag method to get the hyperlinks\n",
" def handle_starttag(self, tag, attrs):\n",
" attrs = dict(attrs)\n",
"\n",
" # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks\n",
" if tag == \"a\" and \"href\" in attrs:\n",
" self.hyperlinks.append(attrs[\"href\"])\n",
"\n",
"# Function to get the hyperlinks from a URL\n",
"def get_hyperlinks(url):\n",
" \n",
" # Try to open the URL and read the HTML\n",
" try:\n",
" # Open the URL and read the HTML\n",
" with urllib.request.urlopen(url) as response:\n",
"\n",
" # If the response is not HTML, return an empty list\n",
" if not response.info().get('Content-Type').startswith(\"text/html\"):\n",
" return []\n",
" \n",
" # Decode the HTML\n",
" html = response.read().decode('utf-8')\n",
" except Exception as e:\n",
" print(e)\n",
" return []\n",
"\n",
" # Create the HTML Parser and then Parse the HTML to get hyperlinks\n",
" parser = HyperlinkParser()\n",
" parser.feed(html)\n",
"\n",
" return parser.hyperlinks\n",
"\n",
"# Function to get the hyperlinks from a URL that are within the same domain\n",
"def get_domain_hyperlinks(local_domain, url):\n",
" clean_links = []\n",
" for link in set(get_hyperlinks(url)):\n",
" clean_link = None\n",
"\n",
" # If the link is a URL, check if it is within the same domain\n",
" if re.search(HTTP_URL_PATTERN, link):\n",
" # Parse the URL and check if the domain is the same\n",
" url_obj = urlparse(link)\n",
" if url_obj.netloc == local_domain:\n",
" clean_link = link\n",
"\n",
" # If the link is not a URL, check if it is a relative link\n",
" else:\n",
" if link.startswith(\"/\"):\n",
" link = link[1:]\n",
" elif link.startswith(\"#\") or link.startswith(\"mailto:\"):\n",
" continue\n",
" clean_link = \"https://\" + local_domain + \"/\" + link\n",
"\n",
" if clean_link is not None:\n",
" if clean_link.endswith(\"/\"):\n",
" clean_link = clean_link[:-1]\n",
" clean_links.append(clean_link)\n",
"\n",
" # Return the list of hyperlinks that are within the same domain\n",
" return list(set(clean_links))\n",
"\n",
"\n",
"def crawl(url):\n",
" # Parse the URL and get the domain\n",
" local_domain = urlparse(url).netloc\n",
"\n",
" # Create a queue to store the URLs to crawl\n",
" queue = deque([url])\n",
"\n",
" # Create a set to store the URLs that have already been seen (no duplicates)\n",
" seen = set([url])\n",
"\n",
" # Create a directory to store the text files\n",
" if not os.path.exists(\"text/\"):\n",
" os.mkdir(\"text/\")\n",
"\n",
" if not os.path.exists(\"text/\"+local_domain+\"/\"):\n",
" os.mkdir(\"text/\" + local_domain + \"/\")\n",
"\n",
" # Create a directory to store the csv files\n",
" if not os.path.exists(\"processed\"):\n",
" os.mkdir(\"processed\")\n",
"\n",
" # While the queue is not empty, continue crawling\n",
" while queue:\n",
"\n",
" # Get the next URL from the queue\n",
" url = queue.popleft()\n",
" print(url) # for debugging and to see the progress\n",
"\n",
" # Save text from the url to a <url>.txt file\n",
" with open('text/'+local_domain+'/'+url[8:].replace(\"/\", \"_\") + \".txt\", \"w\") as f:\n",
"\n",
" # Get the text from the URL using BeautifulSoup\n",
" soup = BeautifulSoup(requests.get(url).text, \"html.parser\")\n",
"\n",
" # Get the text but remove the tags\n",
" text = soup.get_text()\n",
"\n",
" # If the crawler gets to a page that requires JavaScript, flag it (the fetched HTML lacks the real content); the crawl itself continues\n",
" if (\"You need to enable JavaScript to run this app.\" in text):\n",
" print(\"Unable to parse page \" + url + \" due to JavaScript being required\")\n",
" \n",
" # Otherwise, write the text to the file in the text directory\n",
" f.write(text)\n",
"\n",
" # Get the hyperlinks from the URL and add them to the queue\n",
" for link in get_domain_hyperlinks(local_domain, url):\n",
" if link not in seen:\n",
" queue.append(link)\n",
" seen.add(link)\n",
"\n",
"crawl(full_url)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"def remove_newlines(serie):\n",
" serie = serie.str.replace('\\n', ' ')\n",
" serie = serie.str.replace('\\\\n', ' ')\n",
" serie = serie.str.replace(' ', ' ')\n",
" serie = serie.str.replace(' ', ' ')\n",
" return serie"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Create a list to store the text files\n",
"texts=[]\n",
"\n",
"# Get all the text files in the text directory\n",
"for file in os.listdir(\"text/\" + domain + \"/\"):\n",
"\n",
" # Open the file and read the text\n",
" with open(\"text/\" + domain + \"/\" + file, \"r\") as f:\n",
" text = f.read()\n",
"\n",
" # Omit the first 11 characters (the 'openai.com_' domain prefix) and the last 4 characters ('.txt'), then replace -, _, and #update with spaces.\n",
" texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))\n",
"\n",
"# Create a dataframe from the list of texts\n",
"df = pd.DataFrame(texts, columns = ['fname', 'text'])\n",
"\n",
"# Set the text column to be the raw text with the newlines removed\n",
"df['text'] = df.fname + \". \" + remove_newlines(df.text)\n",
"df.to_csv('processed/scraped.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: >"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAikAAAGdCAYAAADXIOPgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAlRElEQVR4nO3df3RU9Z3/8VcSJhMCTELATEhJEIsFIyAKNcy2da2EBJrjas05iy3HTS0Ht2zwVNOlmi4iP9oTDtuv2tqIPbsW3LOlbOkpuiJiYhRYa/iVmkrAZsWlG3dxkhaaBIhMhuTz/cPv3K9jgjB4yXwmeT7OyTncez/zmc/7PdfJy5m5mSRjjBEAAIBlkuO9AAAAgIEQUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAVhoR7wVcjr6+Pp04cUJjxoxRUlJSvJcDAAAugTFGp0+fVm5urpKTL/46SUKGlBMnTigvLy/eywAAAJfhvffe08SJEy86LiFDypgxYyR9WKTP53Nt3nA4rNraWhUXF8vj8bg2byKhB/RAogcSPRju9Uv0QHK/B11dXcrLy3N+j19MQoaUyFs8Pp/P9ZCSnp4un883rE9IekAP6AE9GO71S/RAunI9uNSPavDBWQAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArjYj3Amw0ffXLCvVe2tdI2+IP60vjvQQAAFzFKykAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABY6VOFlPXr1yspKUkPPPCAs+/cuXOqqKjQuHHjNHr0aJWVlamtrS3qdq2trSotLVV6erqys7O1YsUKnT9//tMsBQAADDGXHVIOHjyon/70p5o5c2bU/gcffFAvvPCCtm3bpj179ujEiRO66667nOO9vb0qLS1VT0+P3njjDT377LPavHmzVq1adflVAACAIeeyQsqZM2e0ePFi/dM//ZPGjh3r7O/s7NQzzzyjxx57TLfddptmz56tTZs26Y033tC+ffskSbW1tTp69Kj+9V//VbNmzdLChQu1bt061dTUqKenx52qAABAwhtxOTeqqKhQaWmpioqK9P3vf9/Z39jYqHA4rKKiImfftGnTlJ+fr4aGBs2dO1cNDQ2aMWOG/H6/M6akpETLli3TkSNHdOONN/a7v1AopFAo5Gx3dXVJksLhsMLh8OWUMKDIXN5k49qcg8WtPkTmcbOviYYe0AOJHgz3+iV6ILnfg1jniTmkbN26Vb/97W918ODBfseCwaBSU1OVmZkZtd/v9ysYDDpjPhpQIscjxwZSXV2tNWvW9NtfW1ur9PT0WEu4qHVz+lyf80rbuXOnq/PV1dW5Ol8iogf0QKIHw71+iR5I7vWgu7s7pvExhZT33ntP3/72t1VXV6e0tLSY7ujTqKqqUmVlpbPd1dWlvLw8FRcXy+fzuXY/4XBYdXV1euRQskJ9Sa7NOxiaV5e4Mk+kB/Pnz5fH43FlzkRDD+iBRA+Ge/0SPZDc70HknZBLFVNIaWxsVHt7u2666SZnX29vr/bu3auf/OQnevnll9XT06OOjo6oV1Pa2tqUk5MjScrJydGBAwei5o1c/RMZ83Fer1der7fffo/Hc0VOnFBfkkK9iRVS3O7DleptIqEH9ECiB8O9fokeSO71INY5Yvrg7Lx583T48GE1NTU5P3PmzNHixYudf3s8HtXX1zu3aWlpUWtrqwKBgCQpEAjo8OHDam9vd8bU1dXJ5/OpoKAgpsUDAIChK6ZXUsaMGaPp06dH7Rs1apTGjRvn7F+yZI
kqKyuVlZUln8+n+++/X4FAQHPnzpUkFRcXq6CgQPfcc482bNigYDColStXqqKiYsBXSwAAwPB0WVf3fJLHH39cycnJKisrUygUUklJiZ566inneEpKinbs2KFly5YpEAho1KhRKi8v19q1a91eCgAASGCfOqTs3r07ajstLU01NTWqqam54G0mTZrk+tUoAABgaOG7ewAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArEVIAAICVCCkAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArEVIAAICVCCkAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFYipAAAACsRUgAAgJUIKQAAwEqEFAAAYCVCCgAAsBIhBQAAWImQAgAArERIAQAAViKkAAAAKxFSAACAlQgpAADASoQUAABgJUIKAACwEiEFAABYiZACAACsREgBAABWIqQAAAArEVIAAICVCCkAAMBKhBQAAGAlQgoAALASIQUAAFiJkAIAAKxESAEAAFaKKaRs3LhRM2fOlM/nk8/nUyAQ0EsvveQcP3funCoqKjRu3DiNHj1aZWVlamtri5qjtbVVpaWlSk9PV3Z2tlasWKHz58+7Uw0AABgyYgopEydO1Pr169XY2KhDhw7ptttu0x133KEjR45Ikh588EG98MIL2rZtm/bs2aMTJ07orrvucm7f29ur0tJS9fT06I033tCzzz6rzZs3a9WqVe5WBQAAEt6IWAbffvvtUds/+MEPtHHjRu3bt08TJ07UM888oy1btui2226TJG3atEnXXXed9u3bp7lz56q2tlZHjx7VK6+8Ir/fr1mzZmndunV66KGHtHr1aqWmprpXGQAASGgxhZSP6u3t1bZt23T27FkFAgE1NjYqHA6rqKjIGTNt2jTl5+eroaFBc+fOVUNDg2bMmCG/3++MKSkp0bJly3TkyBHdeOONA95XKBRSKBRytru6uiRJ4XBY4XD4ckvoJzKXN9m4NudgcasPkXnc7GuioQf0QKIHw71+iR5I7vcg1nliDimHDx9WIBDQuXPnNHr0aG3fvl0FBQVqampSamqqMjMzo8b7/X4Fg0FJUjAYjAookeORYxdSXV2tNWvW9NtfW1ur9PT0WEu4qHVz+lyf80rbuXOnq/PV1dW5Ol8iogf0QKIHw71+iR5I7vWgu7s7pvExh5SpU6eqqalJnZ2d+tWvfqXy8nLt2bMn1mliUlVVpcrKSme7q6tLeXl5Ki4uls/nc+1+wuGw6urq9MihZIX6klybdzA0ry5xZZ5ID+bPny+Px+PKnImGHtADiR4M9/oleiC534PIOyGXKuaQkpqaqilTpkiSZs+erYMHD+pHP/qRFi1apJ6eHnV0dES9mtLW1qacnBxJUk5Ojg4cOBA1X+Tqn8iYgXi9Xnm93n77PR7PFTlxQn1JCvUmVkhxuw9XqreJhB7QA4keDPf6JXogudeDWOf41H8npa+vT6FQSLNnz5bH41F9fb1zrKWlRa2trQoEApKkQCCgw4cPq7293RlTV1cnn8+ngoKCT7sUAAAwhMT0SkpVVZUWLlyo/Px8nT59Wlu2bNHu3bv18ssvKyMjQ0uWLFFlZaWysrLk8/l0//33KxAIaO7cuZKk4uJiFRQU6J577tGGDRsUDAa1cuVKVVRUDPhKCQAAGL5iCint7e36m7/5G73//vvKyMjQzJkz9fLLL2v+/PmSpMcff1zJyckqKytTKBRSSUmJnnrqKef2KSkp2rFjh5YtW6ZAIK
BRo0apvLxca9eudbcqAACQ8GIKKc8888wnHk9LS1NNTY1qamouOGbSpEmuX4kCAACGHr67BwAAWImQAgAArERIAQA
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import tiktoken\n",
"\n",
"# Load the cl100k_base tokenizer which is designed to work with the ada-002 model\n",
"tokenizer = tiktoken.get_encoding(\"cl100k_base\")\n",
"\n",
"df = pd.read_csv('processed/scraped.csv', index_col=0)\n",
"df.columns = ['title', 'text']\n",
"\n",
"# Tokenize the text and save the number of tokens to a new column\n",
"df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))\n",
"\n",
"# Visualize the distribution of the number of tokens per row using a histogram\n",
"df.n_tokens.hist()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"max_tokens = 500\n",
"\n",
"# Function to split the text into chunks of a maximum number of tokens\n",
"def split_into_many(text, max_tokens = max_tokens):\n",
"\n",
"    # Split the text into sentences\n",
"    sentences = text.split('. ')\n",
"\n",
"    # Get the number of tokens for each sentence\n",
"    n_tokens = [len(tokenizer.encode(\" \" + sentence)) for sentence in sentences]\n",
"    \n",
"    chunks = []\n",
"    tokens_so_far = 0\n",
"    chunk = []\n",
"\n",
"    # Loop through the sentences and tokens joined together in a tuple\n",
"    for sentence, token in zip(sentences, n_tokens):\n",
"\n",
"        # If the number of tokens so far plus the number of tokens in the current sentence is greater \n",
"        # than the max number of tokens, then add the chunk to the list of chunks and reset\n",
"        # the chunk and tokens so far. Skip the append when the chunk is empty, which\n",
"        # happens when the very first sentence is itself longer than max_tokens\n",
"        if tokens_so_far + token > max_tokens:\n",
"            if chunk:\n",
"                chunks.append(\". \".join(chunk) + \".\")\n",
"            chunk = []\n",
"            tokens_so_far = 0\n",
"\n",
"        # If the number of tokens in the current sentence is greater than the max number of \n",
"        # tokens, go to the next sentence\n",
"        if token > max_tokens:\n",
"            continue\n",
"\n",
"        # Otherwise, add the sentence to the chunk and add the number of tokens to the total\n",
"        chunk.append(sentence)\n",
"        tokens_so_far += token + 1\n",
"\n",
"    # Bug fix: flush the final partial chunk, which the loop above never appends;\n",
"    # without this, the tail of every document longer than max_tokens was dropped\n",
"    if chunk:\n",
"        chunks.append(\". \".join(chunk) + \".\")\n",
"\n",
"    return chunks\n",
" \n",
"\n",
"shortened = []\n",
"\n",
"# Loop through the dataframe\n",
"for row in df.iterrows():\n",
"\n",
" # If the text is None, go to the next row\n",
" if row[1]['text'] is None:\n",
" continue\n",
"\n",
" # If the number of tokens is greater than the max number of tokens, split the text into chunks\n",
" if row[1]['n_tokens'] > max_tokens:\n",
" shortened += split_into_many(row[1]['text'])\n",
" \n",
" # Otherwise, add the text to the list of shortened texts\n",
" else:\n",
" shortened.append( row[1]['text'] )"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: >"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGgCAYAAACABpytAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsTklEQVR4nO3df3TU1Z3/8Vd+TCYEmMSgmSEVIlYrpIhQUDLVbq2ERIzWHzm7/mA1bTl6SoMrxKWaLkIAKy7tFsWNsN1FsGebZUtPoYqIhKBx1fArypYfNtWWNrQyyVYM4UcZJsn9/uE3nzomagYmM3fC83HO58Dn3jt37n07J7z8zHwyScYYIwAAAIskx3sBAAAAH0dAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWiSigXHTRRUpKSupxlJeXS5JOnTql8vJyDRs2TEOGDFFpaalaWlrC5mhublZJSYkyMjKUk5OjuXPnqqOjI3o7AgAACS81ksG7du1SZ2enc75v3z5NnTpVf/u3fytJmjNnjl544QWtW7dOmZmZmjVrlm677Ta9/vrrkqTOzk6VlJTI5/PpjTfe0OHDh3XPPffI5XLpscce6/M6urq69N5772no0KFKSkqKZAsAACBOjDE6duyYcnNzlZz8GddIzFl44IEHzOc//3nT1dVl2trajMvlMuvWrXP63377bSPJNDQ0GGOM2bRpk0lOTjaBQMAZs2LFCuPxeEwwGOzz8x46dMhI4uDg4ODg4EjA49ChQ5/5b31EV1A+6vTp0/rP//xPVVRUKCkpSY2NjQqFQiosLHTGjB49WiNHjlRDQ4MKCgrU0NCgyy+/XF6v1xlTXFysmTNnav/+/ZowYUKvzxUMBhUMBp1z8/+/gPngwYMaOnTomW4hTCgU0ssvv6yvfe1rcrlcUZkTPVHn2KDOsUGdY4dax0Z/1/nYsWMaNWpUn/7tPuOAsmHDBrW1tekb3/iGJCkQCCgtLU1ZWVlh47xerwKBgDPmo+Gku7+775MsWbJECxcu7NHe0NCgjIyMM91CDxkZGdqxY0fU5kPvqHNsUOfYoM6xQ61joz/rfPLkSUnq08czzjigrFq1StOmTVNubu6ZTtFnlZWVqqiocM7b29s1YsQIFRUVyePxROU5QqGQamtrNXXqVNJ5P6LOsUGdY4M6xw61jo3+rnN7e3ufx55RQPnDH/6grVu36he/+IXT5vP5dPr0abW1tYVdRWlpaZHP53PG7Ny5M2yu7rt8usf0xu12y+1292h3uVxRL2B/zImeqHNsUOfYoM6xQ61jo7/qHMmcZ/R7UFavXq2cnByVlJQ4bRMnTpTL5VJdXZ3T1tTUpObmZvn9fkmS3+/X3r171dra6oypra2Vx+NRfn7+mSwFAAAMQBFfQenq6tLq1atVVlam1NS/PjwzM1MzZsxQRUWFsrOz5fF4dP/998vv96ugoECSVFRUpPz8fN19991aunSpAoGA5s2bp/Ly8l6vkAAAgHNTxAFl69atam5u1re+9a0efcuWLVNycrJKS0sVDAZVXFysp59+2ulPSUnRxo0bNXPmTPn9fg0ePFhlZWVatGjR2e0CAAAMKBEHlKKiIuc2349LT09XdXW1qqurP/HxeXl52rRpU6RPCwAAziF8Fw8AALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYJ0z/jZjAADw2S56+IV4L6HP3ClGS6+Sxla9pKbv3xjXtXAFBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADrEFAAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAg
AArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYJ+KA8qc//Ul///d/r2HDhmnQoEG6/PLLtXv3bqffGKP58+dr+PDhGjRokAoLC/XOO++EzXHkyBFNnz5dHo9HWVlZmjFjho4fP372uwEAAANCRAHlgw8+0NVXXy2Xy6UXX3xRBw4c0L/8y7/ovPPOc8YsXbpUy5cv18qVK7Vjxw4NHjxYxcXFOnXqlDNm+vTp2r9/v2pra7Vx40a9+uqruu+++6K3KwAAkNBSIxn8z//8zxoxYoRWr17ttI0aNcr5uzFGTzzxhObNm6ebb75ZkvSTn/xEXq9XGzZs0B133KG3335bmzdv1q5duzRp0iRJ0lNPPaUbbrhBP/zhD5Wbm9vjeYPBoILBoHPe3t4uSQqFQgqFQpFs4RN1zxOt+dA76hwb1Dk2qHPsJHKt3Skm3kvoM3eycf7sj1pHMmeSMabPlcvPz1dxcbH++Mc/qr6+Xp/73Of0ne98R/fee68k6Xe/+50+//nP66233tL48eOdx331q1/V+PHj9eSTT+qZZ57Rgw8+qA8++MDp7+joUHp6utatW6dbb721x/NWVVVp4cKFPdpramqUkZHR580CAID4OXnypO666y4dPXpUHo/nU8dGdAXld7/7nVasWKGKigp973vf065du/QP//APSktLU1lZmQKBgCTJ6/WGPc7r9Tp9gUBAOTk54YtITVV2drYz5uMqKytVUVHhnLe3t2vEiBEqKir6zA32VSgUUm1traZOnSqXyxWVOdETdY4N6hwb1Dl2ErnWY6teivcS+sydbLR4Upce2Z2sxvnXR33+7ndA+iKigNLV1aVJkybpsccekyRNmDBB+/bt08qVK1VWVhbZKiPgdrvldrt7tLtcrqi/UPtjTvREnWODOscGdY6dRKx1sDMp3kuIWLArqV/qHMmcEX1Idvjw4crPzw9rGzNmjJqbmyVJPp9PktTS0hI2pqWlxenz+XxqbW0N6+/o6NCRI0ecMQAA4NwWUUC5+uqr1dTUFNb2m9/8Rnl5eZI+/MCsz+dTXV2d09/e3q4dO3bI7/dLkvx+v9ra2tTY2OiM2bZtm7q6ujR58uQz3ggAABg4InqLZ86cOfryl7+sxx57TH/3d3+nnTt36sc//rF+/OMfS5KSkpI0e/ZsPfroo7r00ks1atQoPfLII8rNzdUtt9wi6cMrLtdff73uvfderVy5UqFQSLNmzdIdd9zR6x08AADg3BNRQLnyyiu1fv16VVZWatGiRRo1apSeeOIJTZ8+3Rnz3e9+VydOnNB9992ntrY2XXPNNdq8ebPS09OdMT/96U81a9YsTZkyRcnJySotLdXy5cujtysAAJDQIgooknTjjTfqxhtv/MT+pKQkLVq0SIsWLfrEMdnZ2aqpqYn0qQEAwDmC7+IBAADWIaAAAADrEFAAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1iGgAAAA6xBQAACAdQgoAADAOgQUAABgHQIKAACwDgEFAABYh4ACAACsQ0ABAADWIaAAAADrEFAAAIB1CCgAAMA6BBQAAGAdAgoAALAOAQUAAFiHgAIAAKxDQAEAANYhoAAAAOsQUAAAgHUIKAAAwDoEFAAAYB0CCgAAsA4BBQAAWIeAAgAArENAAQAA1okooFRVVSkpKSnsGD16tNN/6tQplZeXa9iwYRoyZIhKS0vV0tISNkdzc7NKSkqUkZGhnJwczZ07Vx0dHdHZDQAAGBBSI33AF7/4RW3duvWvE6T+dYo5c+bohRde0Lp165SZmalZs2bptttu0+uvvy5J6uzsVElJiXw+n9544w0dPnxY99xzj1wulx577LEobAcAAAwEEQeU1NRU+Xy+Hu1Hjx7VqlWrVFNTo+uuu06StH
r1ao0ZM0bbt29XQUGBtmzZogMHDmjr1q3yer0aP368Fi9erIceekhVVVVKS0s7+x0BAICEF3FAeeedd5Sbm6v09HT
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = pd.DataFrame(shortened, columns = ['text'])\n",
"df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))\n",
"df.n_tokens.hist()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>n_tokens</th>\n",
" <th>embeddings</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>blog authors maddie. Maddie Hall - OpenAI ...</td>\n",
" <td>175</td>\n",
" <td>[-0.012958061881363392, -0.006103983614593744,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>blog authors tom. Tom Brown - OpenAI ...</td>\n",
" <td>228</td>\n",
" <td>[-0.0053874170407652855, -0.009962032549083233...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>blog openai scholars 2019 final projects. Op...</td>\n",
" <td>492</td>\n",
" <td>[0.0019150723237544298, -0.0070442273281514645...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>In this project, I used curiosity-driven explo...</td>\n",
" <td>478</td>\n",
" <td>[-0.0067560747265815735, 0.0004431474662851542...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Results revealed that the optimal RL policies ...</td>\n",
" <td>499</td>\n",
" <td>[-0.012868616729974747, 0.0029640409629791975,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text n_tokens \\\n",
"0 blog authors maddie. Maddie Hall - OpenAI ... 175 \n",
"1 blog authors tom. Tom Brown - OpenAI ... 228 \n",
"2 blog openai scholars 2019 final projects. Op... 492 \n",
"3 In this project, I used curiosity-driven explo... 478 \n",
"4 Results revealed that the optimal RL policies ... 499 \n",
"\n",
" embeddings \n",
"0 [-0.012958061881363392, -0.006103983614593744,... \n",
"1 [-0.0053874170407652855, -0.009962032549083233... \n",
"2 [0.0019150723237544298, -0.0070442273281514645... \n",
"3 [-0.0067560747265815735, 0.0004431474662851542... \n",
"4 [-0.012868616729974747, 0.0029640409629791975,... "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import openai\n",
"\n",
"df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])\n",
"df.to_csv('processed/embeddings.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>n_tokens</th>\n",
" <th>embeddings</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>blog authors maddie. Maddie Hall - OpenAI ...</td>\n",
" <td>175</td>\n",
" <td>[-0.012958061881363392, -0.006103983614593744,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>blog authors tom. Tom Brown - OpenAI ...</td>\n",
" <td>228</td>\n",
" <td>[-0.0053874170407652855, -0.009962032549083233...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>blog openai scholars 2019 final projects. Op...</td>\n",
" <td>492</td>\n",
" <td>[0.0019150723237544298, -0.0070442273281514645...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>In this project, I used curiosity-driven explo...</td>\n",
" <td>478</td>\n",
" <td>[-0.0067560747265815735, 0.0004431474662851542...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Results revealed that the optimal RL policies ...</td>\n",
" <td>499</td>\n",
" <td>[-0.012868616729974747, 0.0029640409629791975,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text n_tokens \\\n",
"0 blog authors maddie. Maddie Hall - OpenAI ... 175 \n",
"1 blog authors tom. Tom Brown - OpenAI ... 228 \n",
"2 blog openai scholars 2019 final projects. Op... 492 \n",
"3 In this project, I used curiosity-driven explo... 478 \n",
"4 Results revealed that the optimal RL policies ... 499 \n",
"\n",
" embeddings \n",
"0 [-0.012958061881363392, -0.006103983614593744,... \n",
"1 [-0.0053874170407652855, -0.009962032549083233... \n",
"2 [0.0019150723237544298, -0.0070442273281514645... \n",
"3 [-0.0067560747265815735, 0.0004431474662851542... \n",
"4 [-0.012868616729974747, 0.0029640409629791975,... "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from openai.embeddings_utils import distances_from_embeddings\n",
"\n",
"# NOTE(review): the embeddings were already computed and written to\n",
"# processed/embeddings.csv by the previous cell; recomputing them here\n",
"# repeated every (billable) API call, so the redundant call was removed.\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>n_tokens</th>\n",
" <th>embeddings</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>blog authors maddie. Maddie Hall - OpenAI ...</td>\n",
" <td>175</td>\n",
" <td>[-0.012958061881363392, -0.006103983614593744,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>blog authors tom. Tom Brown - OpenAI ...</td>\n",
" <td>228</td>\n",
" <td>[-0.0053874170407652855, -0.009962032549083233...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>blog openai scholars 2019 final projects. Op...</td>\n",
" <td>492</td>\n",
" <td>[0.0019150723237544298, -0.0070442273281514645...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>In this project, I used curiosity-driven explo...</td>\n",
" <td>478</td>\n",
" <td>[-0.0067560747265815735, 0.0004431474662851542...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Results revealed that the optimal RL policies ...</td>\n",
" <td>499</td>\n",
" <td>[-0.012868616729974747, 0.0029640409629791975,...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text n_tokens \\\n",
"0 blog authors maddie. Maddie Hall - OpenAI ... 175 \n",
"1 blog authors tom. Tom Brown - OpenAI ... 228 \n",
"2 blog openai scholars 2019 final projects. Op... 492 \n",
"3 In this project, I used curiosity-driven explo... 478 \n",
"4 Results revealed that the optimal RL policies ... 499 \n",
"\n",
" embeddings \n",
"0 [-0.012958061881363392, -0.006103983614593744,... \n",
"1 [-0.0053874170407652855, -0.009962032549083233... \n",
"2 [0.0019150723237544298, -0.0070442273281514645... \n",
"3 [-0.0067560747265815735, 0.0004431474662851542... \n",
"4 [-0.012868616729974747, 0.0029640409629791975,... "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from openai.embeddings_utils import distances_from_embeddings, cosine_similarity\n",
"\n",
"df=pd.read_csv('processed/embeddings.csv', index_col=0)\n",
"df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'No, you are not allowed to publish model outputs to Twitter without a human review. You must manually review each generation before sharing or while streaming, and indicate that the content is AI-generated in a way no user could reasonably miss or misunderstand.'"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def create_context(\n",
" question, df, max_len=1800, size=\"ada\"\n",
"):\n",
" \"\"\"\n",
" Create a context for a question by finding the most similar context from the dataframe\n",
" \"\"\"\n",
"\n",
" # Get the embeddings for the question\n",
" q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']\n",
"\n",
" # Get the distances from the embeddings\n",
" df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')\n",
"\n",
"\n",
" returns = []\n",
" cur_len = 0\n",
"\n",
" # Sort by distance and add the text to the context until the context is too long\n",
" for i, row in df.sort_values('distances', ascending=True).iterrows():\n",
" \n",
" # Add the length of the text to the current length\n",
" cur_len += row['n_tokens'] + 4\n",
" \n",
" # If the context is too long, break\n",
" if cur_len > max_len:\n",
" break\n",
" \n",
" # Else add it to the text that is being returned\n",
" returns.append(row[\"text\"])\n",
"\n",
" # Return the context\n",
" return \"\\n\\n###\\n\\n\".join(returns)\n",
"\n",
"def answer_question(\n",
" df,\n",
" model=\"text-davinci-003\",\n",
" question=\"Am I allowed to publish model outputs to Twitter, without a human review?\",\n",
" max_len=1800,\n",
" size=\"ada\",\n",
" debug=False,\n",
" max_tokens=150,\n",
" stop_sequence=None\n",
"):\n",
" \"\"\"\n",
" Answer a question based on the most similar context from the dataframe texts\n",
" \"\"\"\n",
" context = create_context(\n",
" question,\n",
" df,\n",
" max_len=max_len,\n",
" size=size,\n",
" )\n",
" # If debug, print the raw model response\n",
" if debug:\n",
" print(\"Context:\\n\" + context)\n",
" print(\"\\n\\n\")\n",
"\n",
" try:\n",
" # Create a completions using the question and context\n",
" response = openai.Completion.create(\n",
" prompt=f\"Answer the question based on the context below, and if the question can't be answered based on the context, say \\\"I don't know\\\"\\n\\nContext: {context}\\n\\n---\\n\\nQuestion: {question}\\nAnswer:\",\n",
" temperature=0,\n",
" max_tokens=max_tokens,\n",
" top_p=1,\n",
" frequency_penalty=0,\n",
" presence_penalty=0,\n",
" stop=stop_sequence,\n",
" model=model,\n",
" )\n",
" return response[\"choices\"][0][\"text\"].strip()\n",
" except Exception as e:\n",
" print(e)\n",
" return \"\""
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"I don't know.\""
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answer_question(df, question=\"What day is it?\", debug=False)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The newest embeddings model is text-embedding-ada-002.'"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answer_question(df, question=\"What is our newest embeddings model?\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "05f34a34d73b71652304030c1097be3a5720ea2447153dd6542d145a26b73181"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}