################################################################################
### Step 1
################################################################################
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import tiktoken
import openai
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Define root domain to crawl
domain = "openai.com"
full_url = "https://openai.com/"

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])
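
# Quick illustrative check of the parser above (a sketch, not part of the original
# walkthrough): feeding a small HTML snippet should leave only anchor hrefs in
# .hyperlinks. The name _demo_parser exists only for this example.
_demo_parser = HyperlinkParser()
_demo_parser.feed('<a href="/blog">Blog</a> <img src="logo.png">')
assert _demo_parser.hyperlinks == ['/blog']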
################################################################################
### Step 2
################################################################################
# Function to get the hyperlinks from a URL
def get_hyperlinks(url):

    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []

            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks
################################################################################
### Step 3
################################################################################
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))
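
# Illustration of the filtering rules above (a sketch; the hrefs are hypothetical, and
# since get_domain_hyperlinks fetches the page itself this is shown as comments only):
#   raw href                        -> kept as
#   "https://openai.com/blog/"      -> "https://openai.com/blog"    (same domain, trailing "/" stripped)
#   "https://twitter.com/openai"    -> dropped                       (different domain)
#   "/pricing"                      -> "https://openai.com/pricing"  (relative link resolved)
#   "#section", "mailto:a@b.com"    -> dropped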
################################################################################
### Step 4
################################################################################
def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
        os.mkdir("text/")

    if not os.path.exists("text/" + local_domain + "/"):
        os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
        os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open('text/' + local_domain + '/' + url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:

            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")

            # Get the text but remove the tags
            text = soup.get_text()

            # If the crawler gets to a page that requires JavaScript, flag it
            if ("You need to enable JavaScript to run this app." in text):
                print("Unable to parse page " + url + " due to JavaScript being required")

            # Write the text to the file in the text directory
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

crawl(full_url)
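
# Note on the file naming used inside crawl() above: url[8:] strips the leading
# "https://" (8 characters) and the remaining "/" characters become "_", so, for
# example, "https://openai.com/blog/page" (an illustrative URL) is saved as
# "text/openai.com/openai.com_blog_page.txt".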
################################################################################
### Step 5
################################################################################
def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie
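
# What remove_newlines does, on a small made-up example (the Series content is
# hypothetical): pd.Series(['First line\nsecond  line']) becomes 'First line second line',
# i.e. literal newlines, escaped "\n" sequences, and double spaces collapse to single spaces.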
################################################################################
### Step 6
################################################################################
# Create a list to store the text files
texts = []

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
        text = f.read()

        # Omit the first 11 characters ("openai.com_") and the last 4 characters (".txt"),
        # replace - and _ with spaces, and drop #update
        texts.append((file[11:-4].replace('-', ' ').replace('_', ' ').replace('#update', ''), text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns=['fname', 'text'])

# Set the text column to be the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')
df.head()
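
# How the 'fname' column is derived above, on a hypothetical file name:
#   "openai.com_blog_my-page.txt"  --file[11:-4]-->  "blog_my-page"
#   --replace('-', ' ').replace('_', ' ')-->  "blog my page"
# The first 11 characters are the "openai.com_" prefix and the last 4 are ".txt".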
################################################################################
### Step 7
################################################################################
# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# Visualize the distribution of the number of tokens per row using a histogram
df.n_tokens.hist()
################################################################################
### Step 8
################################################################################
max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens=max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add the last chunk to the list of chunks
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks
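
# Rough sketch of the chunking behaviour (hypothetical token counts): with max_tokens = 500,
# sentences of 200, 250 and 180 tokens become ["s1. s2.", "s3."], because 200 + 250 (plus
# one token per sentence) still fits but adding 180 would pass 500, so a new chunk starts.
# A single sentence longer than max_tokens is skipped entirely by the `continue` above.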
shortened = []

# Loop through the dataframe
for row in df.iterrows():

    # If the text is None, go to the next row
    if row[1]['text'] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]['n_tokens'] > max_tokens:
        shortened += split_into_many(row[1]['text'])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append(row[1]['text'])
################################################################################
### Step 9
################################################################################
df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
df.n_tokens.hist()
################################################################################
### Step 10
################################################################################
# Note that you may run into rate limit issues depending on how many files you try to embed
# Please check out our rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits
df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
df.to_csv('processed/embeddings.csv')
df.head()
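
# If the embedding loop above trips the rate limit, one option (a sketch, not part of the
# original walkthrough) is a small retry wrapper around openai.Embedding.create with
# exponential backoff. The delays and retry count here are arbitrary choices.
import time

def embed_with_backoff(text, engine='text-embedding-ada-002', max_retries=5):
    delay = 1
    for attempt in range(max_retries):
        try:
            return openai.Embedding.create(input=text, engine=engine)['data'][0]['embedding']
        except Exception as e:  # e.g. a rate limit error
            if attempt == max_retries - 1:
                raise
            print(f"Embedding call failed ({e}); retrying in {delay}s")
            time.sleep(delay)
            delay *= 2

# Usage would mirror the line above, e.g.:
# df['embeddings'] = df.text.apply(embed_with_backoff)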
################################################################################
### Step 11
################################################################################
df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
df.head()
################################################################################
### Step 12
################################################################################
def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)
def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""
################################################################################
### Step 13
################################################################################

print(answer_question(df, question="What day is it?", debug=False))

print(answer_question(df, question="What is our newest embeddings model?"))