# openai-cookbook/apps/web-crawl-q-and-a/web-qa.py

################################################################################
### Step 1
################################################################################

import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import tiktoken
import openai
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

# Regex pattern to match an absolute http:// or https:// URL.
# The "s" is optional and may appear at most once ("[s]*" would also accept
# malformed schemes such as "httpss://").
HTTP_URL_PATTERN = r'^https?://.+'

# Define root domain to crawl
# Host name of the site; also used to locate the text/<domain>/ folder
domain = "openai.com"
# Seed URL that is handed to crawl()
full_url = "https://openai.com/"

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` start tag fed to the parser.

    After ``feed(html)`` has been called, ``self.hyperlinks`` holds the raw
    href strings in document order (duplicates included, no normalisation).
    """

    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is None for a valueless attribute such as
        ``<a href>``.
        """
        if tag != "a":
            return

        href = dict(attrs).get("href")

        # Skip anchors without an href and valueless hrefs (None) so that
        # every collected entry is guaranteed to be a string
        if href is not None:
            self.hyperlinks.append(href)

################################################################################
### Step 2
################################################################################

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Fetch ``url`` and return the raw href values of its anchor tags.

    Returns an empty list when the request fails, when the response is not
    served as ``text/html`` (including a missing Content-Type header), or
    when the body cannot be decoded. Errors are printed, never raised.
    """

    # Try to open the URL and read the HTML
    try:
        with urllib.request.urlopen(url) as response:
            headers = response.info()

            # A missing Content-Type header yields None; treat it as "not HTML"
            # explicitly instead of relying on the except clause below
            content_type = headers.get('Content-Type') or ""
            if not content_type.startswith("text/html"):
                return []

            # Honour the charset declared by the server, fall back to UTF-8,
            # and replace undecodable bytes so one bad byte does not discard
            # every link on the page
            charset = headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
    except Exception as e:
        # Best-effort crawl: report the failure for this page and carry on
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

################################################################################
### Step 3
################################################################################

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the de-duplicated links on ``url`` that belong to ``local_domain``.

    Absolute http(s) links are kept only when their host equals
    ``local_domain``. Every other link is treated as a path on the site and
    is prefixed with ``https://<local_domain>/``. Fragment-only links and
    ``mailto:``/``tel:``/``javascript:`` links are ignored. A single trailing
    slash is stripped from each result. The order of the returned list is
    unspecified.
    """
    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # A valueless href (<a href>) is reported as None by HTMLParser and
        # would make re.search raise TypeError
        if not isinstance(link, str):
            continue

        clean_link = None

        # Protocol-relative link ("//host/path"): give it the https scheme
        # used for all generated links so the host check below applies to it
        if link.startswith("//"):
            link = "https:" + link

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            if urlparse(link).netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith(("#", "mailto:", "tel:", "javascript:")):
                # In-page anchors and non-web schemes are not crawlable pages
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)


################################################################################
### Step 4
################################################################################

def crawl(url):
    """Crawl every same-domain page reachable from ``url`` and save its text.

    The visible text of each page is written to
    ``text/<domain>/<url without scheme, "/" replaced by "_">.txt``.
    The ``processed`` directory is created as well. Pages that cannot be
    fetched are reported and skipped.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create the directories for the text files and the csv files
    text_dir = "text/" + local_domain + "/"
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL; pop() takes from the right end, so the most
        # recently discovered link is visited first
        url = queue.pop()
        print(url) # for debugging and to see the progress

        # Fetch before opening the output file so that a failed request
        # neither leaves an empty .txt behind nor aborts the whole crawl
        try:
            page = requests.get(url)
        except requests.RequestException as e:
            print("Unable to fetch page " + url + ": " + str(e))
            continue

        # Get the text from the URL using BeautifulSoup, without the tags
        text = BeautifulSoup(page.text, "html.parser").get_text()

        # Warn about pages that need JavaScript; the text is still written
        # and the crawl continues
        if ("You need to enable JavaScript to run this app." in text):
            print("Unable to parse page " + url + " due to JavaScript being required")

        # Drop the scheme (works for both http:// and https://) and flatten
        # the path into a single file name
        page_name = url.split("://", 1)[-1].replace("/", "_")

        # Save text from the url to a <url>.txt file
        with open(text_dir + page_name + ".txt", "w", encoding="UTF-8") as f:
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

# Run the crawl from the seed URL. This is a module-level call, so it executes
# as soon as the file is run and fills text/<domain>/ with the page texts.
crawl(full_url)

################################################################################
### Step 5
################################################################################

def remove_newlines(serie):
    """Return a copy of the string Series with line breaks flattened to spaces.

    Real newline characters and literal backslash-n sequences are replaced by
    a space, then every run of two or more spaces is collapsed to one.
    """
    # regex=False: match the patterns literally. The default of ``regex``
    # differs between pandas versions, which changes how '\\n' is read.
    serie = serie.str.replace('\n', ' ', regex=False)
    serie = serie.str.replace('\\n', ' ', regex=False)
    # Collapse runs of any length; two passes of '  ' -> ' ' leave longer
    # runs (e.g. from several consecutive newlines) only partly collapsed
    serie = serie.str.replace(r' {2,}', ' ', regex=True)
    return serie


################################################################################
### Step 6
################################################################################

# Create a list to store the text files
texts=[]

# The crawler names each file "<domain>_<path>.txt"; the title is what remains
# after dropping that "<domain>_" prefix and the 4-character ".txt" suffix.
# Derive the prefix length from the domain instead of hard-coding it.
_prefix_len = len(domain) + 1

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
        text = f.read()

    # Strip prefix and suffix (characters, not lines), then replace - and _
    # with spaces and remove "#update"
    texts.append((file[_prefix_len:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns = ['fname', 'text'])

# Set the text column to be the title followed by the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')
df.head()

################################################################################
### Step 7
################################################################################

# cl100k_base is the encoding that matches the ada-002 embedding model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Reload the scraped pages; the first CSV column holds the saved index
df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Store the token count of every page next to its text
df['n_tokens'] = df['text'].apply(lambda page_text: len(tokenizer.encode(page_text)))

# Histogram of tokens per page
df['n_tokens'].hist()

################################################################################
### Step 8
################################################################################

max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens, encoder = None):
    """Split ``text`` into sentence-aligned chunks of at most ``max_tokens`` tokens.

    The text is split on ". "; consecutive sentences are packed into a chunk
    until adding the next one would exceed ``max_tokens``. Each chunk is the
    sentences joined by ". " with a final "." appended. A single sentence
    that is longer than ``max_tokens`` on its own is dropped.

    ``encoder`` is any object with an ``encode(str)`` method returning a
    sized sequence of tokens; when omitted, the module-level ``tokenizer``
    is used.
    """
    # Fall back to the module-level tokenizer unless the caller supplies one
    if encoder is None:
        encoder = tokenizer

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(encoder.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If this sentence would push the current chunk over the limit,
        # close the chunk and start a new one
        if tokens_so_far + token > max_tokens:
            # Only flush when something was collected; flushing an empty
            # chunk (e.g. before an oversized first sentence) would emit a
            # meaningless "." chunk
            if chunk:
                chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        # (+1 for the separator that joins it to the next sentence)
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add the last chunk to the list of chunks
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks
    

shortened = []

# Loop through the dataframe
for _, row in df.iterrows():

    # Skip rows without text. pd.isna covers both None and NaN; a plain
    # "is None" test does not match the NaN that pandas uses for missing
    # values in a dataframe loaded from CSV.
    if pd.isna(row['text']):
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row['n_tokens'] > max_tokens:
        shortened += split_into_many(row['text'])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append(row['text'])

################################################################################
### Step 9
################################################################################

# Replace the page-level dataframe with one row per shortened chunk
df = pd.DataFrame(data=shortened, columns=['text'])

# Recount the tokens for every chunk and plot their distribution
df['n_tokens'] = df['text'].apply(lambda chunk_text: len(tokenizer.encode(chunk_text)))
df['n_tokens'].hist()

################################################################################
### Step 10
################################################################################

# Embedding many chunks can hit the API rate limits.
# The rate limit guide explains how to handle this: https://platform.openai.com/docs/guides/rate-limits

def _embed_chunk(chunk_text):
    # One API request per chunk; keep only the embedding vector of the first result
    result = openai.Embedding.create(input=chunk_text, engine='text-embedding-ada-002')
    return result['data'][0]['embedding']

# Embed every chunk and persist the result
df['embeddings'] = df['text'].apply(_embed_chunk)
df.to_csv('processed/embeddings.csv')
df.head()

################################################################################
### Step 11
################################################################################

# Reload the saved embeddings; the first CSV column holds the saved index
df=pd.read_csv('processed/embeddings.csv', index_col=0)
# The CSV stores each embedding as text, so evaluate it back into a Python
# object and wrap the result in a NumPy array.
# NOTE(review): eval() executes whatever the cell contains. That is only safe
# while this CSV is produced by this script itself; consider
# ast.literal_eval if the file can ever come from another source.
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

# Preview of the dataframe; the result is discarded when run as a plain script
df.head()

################################################################################
### Step 12
################################################################################

def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Build the context for a question from the closest texts in the dataframe.

    The question is embedded, its cosine distance to every row of
    df['embeddings'] is written to df['distances'], and the row texts are
    collected from nearest to farthest until the running token total
    (n_tokens + 4 per row) would exceed max_len. The collected texts are
    returned joined by a "###" separator line. `size` is accepted but not used.
    """

    # Embed the question
    embedding_response = openai.Embedding.create(input=question, engine='text-embedding-ada-002')
    question_vector = embedding_response['data'][0]['embedding']

    # Distance from the question to every stored embedding (kept on the dataframe)
    df['distances'] = distances_from_embeddings(question_vector, df['embeddings'].values, distance_metric='cosine')

    # Walk the rows from nearest to farthest while the token budget lasts
    ranked = df.sort_values('distances', ascending=True)
    selected = []
    tokens_used = 0
    for _, candidate in ranked.iterrows():

        # Account for this row before deciding whether it still fits
        tokens_used += candidate['n_tokens'] + 4
        if tokens_used > max_len:
            break

        selected.append(candidate["text"])

    separator = "\n\n###\n\n"
    return separator.join(selected)

def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question using the closest texts in the dataframe as context.

    The context comes from create_context(); it is printed first when `debug`
    is true. The completion model is asked to answer from that context only
    and to say "I don't know" otherwise. Returns the stripped text of the
    first completion choice, or "" (after printing the error) if anything in
    the completion request fails.
    """
    context = create_context(question, df, max_len=max_len, size=size)

    # Show the context that will be sent to the model
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    # Fixed sampling settings for a deterministic answer
    completion_options = dict(
        temperature=0,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=stop_sequence,
        model=model,
    )

    try:
        # Build the prompt from the question and context, then request a completion
        prompt = f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:"
        response = openai.Completion.create(prompt=prompt, **completion_options)
        first_choice = response["choices"][0]
        return first_choice["text"].strip()
    except Exception as e:
        print(e)
        return ""

################################################################################
### Step 13
################################################################################

# Ask two sample questions and print each answer
for sample_question in ("What day is it?", "What is our newest embeddings model?"):
    print(answer_question(df, question=sample_question))
Add in web crawl Q&A tutorial 2023-02-03 21:42:19 +00:00			`################################################################################`
			`### Step 1`
			`################################################################################`

			`import requests`
			`import re`
			`import urllib.request`
			`from bs4 import BeautifulSoup`
			`from collections import deque`
			`from html.parser import HTMLParser`
			`from urllib.parse import urlparse`
			`import os`
			`import pandas as pd`
			`import tiktoken`
			`import openai`
			`import numpy as np`
			`from openai.embeddings_utils import distances_from_embeddings, cosine_similarity`

			`# Regex pattern to match a URL`
			`HTTP_URL_PATTERN = r'^http[s]*://.+'`

			`# Define root domain to crawl`
			`domain = "openai.com"`
			`full_url = "https://openai.com/"`

			`# Create a class to parse the HTML and get the hyperlinks`
			`class HyperlinkParser(HTMLParser):`
			`def __init__(self):`
			`super().__init__()`
			`# Create a list to store the hyperlinks`
			`self.hyperlinks = []`

			`# Override the HTMLParser's handle_starttag method to get the hyperlinks`
			`def handle_starttag(self, tag, attrs):`
			`attrs = dict(attrs)`

			`# If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks`
			`if tag == "a" and "href" in attrs:`
			`self.hyperlinks.append(attrs["href"])`

			`################################################################################`
			`### Step 2`
			`################################################################################`

			`# Function to get the hyperlinks from a URL`
			`def get_hyperlinks(url):`

			`# Try to open the URL and read the HTML`
			`try:`
			`# Open the URL and read the HTML`
			`with urllib.request.urlopen(url) as response:`

			`# If the response is not HTML, return an empty list`
			`if not response.info().get('Content-Type').startswith("text/html"):`
			`return []`

			`# Decode the HTML`
			`html = response.read().decode('utf-8')`
			`except Exception as e:`
			`print(e)`
			`return []`

			`# Create the HTML Parser and then Parse the HTML to get hyperlinks`
			`parser = HyperlinkParser()`
			`parser.feed(html)`

			`return parser.hyperlinks`

			`################################################################################`
			`### Step 3`
			`################################################################################`

			`# Function to get the hyperlinks from a URL that are within the same domain`
			`def get_domain_hyperlinks(local_domain, url):`
			`clean_links = []`
			`for link in set(get_hyperlinks(url)):`
			`clean_link = None`

			`# If the link is a URL, check if it is within the same domain`
			`if re.search(HTTP_URL_PATTERN, link):`
			`# Parse the URL and check if the domain is the same`
			`url_obj = urlparse(link)`
			`if url_obj.netloc == local_domain:`
			`clean_link = link`

			`# If the link is not a URL, check if it is a relative link`
			`else:`
			`if link.startswith("/"):`
			`link = link[1:]`
			`elif link.startswith("#") or link.startswith("mailto:"):`
			`continue`
			`clean_link = "https://" + local_domain + "/" + link`

			`if clean_link is not None:`
			`if clean_link.endswith("/"):`
			`clean_link = clean_link[:-1]`
			`clean_links.append(clean_link)`

			`# Return the list of hyperlinks that are within the same domain`
			`return list(set(clean_links))`


			`################################################################################`
			`### Step 4`
			`################################################################################`

			`def crawl(url):`
			`# Parse the URL and get the domain`
			`local_domain = urlparse(url).netloc`

			`# Create a queue to store the URLs to crawl`
			`queue = deque([url])`

			`# Create a set to store the URLs that have already been seen (no duplicates)`
			`seen = set([url])`

			`# Create a directory to store the text files`
			`if not os.path.exists("text/"):`
			`os.mkdir("text/")`

			`if not os.path.exists("text/"+local_domain+"/"):`
			`os.mkdir("text/" + local_domain + "/")`

			`# Create a directory to store the csv files`
			`if not os.path.exists("processed"):`
			`os.mkdir("processed")`

			`# While the queue is not empty, continue crawling`
			`while queue:`

			`# Get the next URL from the queue`
			`url = queue.pop()`
			`print(url) # for debugging and to see the progress`

			`# Save text from the url to a <url>.txt file`
Update web-qa.py 2023-02-06 17:28:31 +00:00			`with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:`
Add in web crawl Q&A tutorial 2023-02-03 21:42:19 +00:00
			`# Get the text from the URL using BeautifulSoup`
			`soup = BeautifulSoup(requests.get(url).text, "html.parser")`

			`# Get the text but remove the tags`
			`text = soup.get_text()`

			`# If the crawler gets to a page that requires JavaScript, it will stop the crawl`
			`if ("You need to enable JavaScript to run this app." in text):`
			`print("Unable to parse page " + url + " due to JavaScript being required")`

			`# Otherwise, write the text to the file in the text directory`
			`f.write(text)`

			`# Get the hyperlinks from the URL and add them to the queue`
			`for link in get_domain_hyperlinks(local_domain, url):`
			`if link not in seen:`
			`queue.append(link)`
			`seen.add(link)`

			`crawl(full_url)`

			`################################################################################`
			`### Step 5`
			`################################################################################`

			`def remove_newlines(serie):`
			`serie = serie.str.replace('\n', ' ')`
			`serie = serie.str.replace('\\n', ' ')`
			`serie = serie.str.replace(' ', ' ')`
			`serie = serie.str.replace(' ', ' ')`
			`return serie`


			`################################################################################`
			`### Step 6`
			`################################################################################`

			`# Create a list to store the text files`
			`texts=[]`

			`# Get all the text files in the text directory`
			`for file in os.listdir("text/" + domain + "/"):`

			`# Open the file and read the text`
Update web-qa.py 2023-02-06 17:28:31 +00:00			`with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:`
Add in web crawl Q&A tutorial 2023-02-03 21:42:19 +00:00			`text = f.read()`

			`# Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.`
			`texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))`

			`# Create a dataframe from the list of texts`
			`df = pd.DataFrame(texts, columns = ['fname', 'text'])`

			`# Set the text column to be the raw text with the newlines removed`
			`df['text'] = df.fname + ". " + remove_newlines(df.text)`
			`df.to_csv('processed/scraped.csv')`
			`df.head()`

			`################################################################################`
			`### Step 7`
			`################################################################################`

			`# Load the cl100k_base tokenizer which is designed to work with the ada-002 model`
			`tokenizer = tiktoken.get_encoding("cl100k_base")`

			`df = pd.read_csv('processed/scraped.csv', index_col=0)`
			`df.columns = ['title', 'text']`

			`# Tokenize the text and save the number of tokens to a new column`
			`df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))`

			`# Visualize the distribution of the number of tokens per row using a histogram`
			`df.n_tokens.hist()`

			`################################################################################`
			`### Step 8`
			`################################################################################`

			`max_tokens = 500`

			`# Function to split the text into chunks of a maximum number of tokens`
			`def split_into_many(text, max_tokens = max_tokens):`

			`# Split the text into sentences`
			`sentences = text.split('. ')`

			`# Get the number of tokens for each sentence`
			`n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]`

			`chunks = []`
			`tokens_so_far = 0`
			`chunk = []`

			`# Loop through the sentences and tokens joined together in a tuple`
			`for sentence, token in zip(sentences, n_tokens):`

			`# If the number of tokens so far plus the number of tokens in the current sentence is greater`
			`# than the max number of tokens, then add the chunk to the list of chunks and reset`
			`# the chunk and tokens so far`
			`if tokens_so_far + token > max_tokens:`
			`chunks.append(". ".join(chunk) + ".")`
			`chunk = []`
			`tokens_so_far = 0`

			`# If the number of tokens in the current sentence is greater than the max number of`
			`# tokens, go to the next sentence`
			`if token > max_tokens:`
			`continue`

			`# Otherwise, add the sentence to the chunk and add the number of tokens to the total`
			`chunk.append(sentence)`
			`tokens_so_far += token + 1`
Add handling for last chunk in split_into_sentences function I have added handling for the last chunk in the split_into_sentences function. Previously, the function did not account for the last chunk, which could lead to incomplete sentences in the output. To solve this, I added a conditional statement to check if the last chunk is non-empty. If it is, I append it to the list of chunks with a period to ensure the last sentence is complete. This change improves the accuracy of the split_into_sentences function and ensures that all sentences in the input text are properly segmented. Please review and let me know if you have any feedback or concerns. 2023-02-19 02:00:27 +00:00
			`# Add the last chunk to the list of chunks`
			`if chunk:`
			`chunks.append(". ".join(chunk) + ".")`
Add in web crawl Q&A tutorial 2023-02-03 21:42:19 +00:00
			`return chunks`


			`shortened = []`

			`# Loop through the dataframe`
			`for row in df.iterrows():`

			`# If the text is None, go to the next row`
			`if row[1]['text'] is None:`
			`continue`

			`# If the number of tokens is greater than the max number of tokens, split the text into chunks`
			`if row[1]['n_tokens'] > max_tokens:`
			`shortened += split_into_many(row[1]['text'])`

			`# Otherwise, add the text to the list of shortened texts`
			`else:`
			`shortened.append( row[1]['text'] )`

			`################################################################################`
			`### Step 9`
			`################################################################################`

			`df = pd.DataFrame(shortened, columns = ['text'])`
			`df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))`
			`df.n_tokens.hist()`

			`################################################################################`
			`### Step 10`
			`################################################################################`

Add comment on where to learn about rate limits 2023-02-17 12:16:14 +00:00			`# Note that you may run into rate limit issues depending on how many files you try to embed`
			`# Please check out our rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits`

Add in web crawl Q&A tutorial 2023-02-03 21:42:19 +00:00			`df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])`
			`df.to_csv('processed/embeddings.csv')`
			`df.head()`

			`################################################################################`
			`### Step 11`
			`################################################################################`

			`df=pd.read_csv('processed/embeddings.csv', index_col=0)`
			`df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)`

			`df.head()`

			`################################################################################`
			`### Step 12`
			`################################################################################`

			`def create_context(`
			`question, df, max_len=1800, size="ada"`
			`):`
			`"""`
			`Create a context for a question by finding the most similar context from the dataframe`
			`"""`

			`# Get the embeddings for the question`
			`q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']`

			`# Get the distances from the embeddings`
			`df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')`


			`returns = []`
			`cur_len = 0`

			`# Sort by distance and add the text to the context until the context is too long`
			`for i, row in df.sort_values('distances', ascending=True).iterrows():`

			`# Add the length of the text to the current length`
			`cur_len += row['n_tokens'] + 4`

			`# If the context is too long, break`
			`if cur_len > max_len:`
			`break`

			`# Else add it to the text that is being returned`
			`returns.append(row["text"])`

			`# Return the context`
			`return "\n\n###\n\n".join(returns)`

			`def answer_question(`
			`df,`
			`model="text-davinci-003",`
			`question="Am I allowed to publish model outputs to Twitter, without a human review?",`
			`max_len=1800,`
			`size="ada",`
			`debug=False,`
			`max_tokens=150,`
			`stop_sequence=None`
			`):`
			`"""`
			`Answer a question based on the most similar context from the dataframe texts`
			`"""`
			`context = create_context(`
			`question,`
			`df,`
			`max_len=max_len,`
			`size=size,`
			`)`
			`# If debug, print the raw model response`
			`if debug:`
			`print("Context:\n" + context)`
			`print("\n\n")`

			`try:`
			`# Create a completions using the questin and context`
			`response = openai.Completion.create(`
			`prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",`
			`temperature=0,`
			`max_tokens=max_tokens,`
			`top_p=1,`
			`frequency_penalty=0,`
			`presence_penalty=0,`
			`stop=stop_sequence,`
			`model=model,`
			`)`
			`return response["choices"][0]["text"].strip()`
			`except Exception as e:`
			`print(e)`
			`return ""`

			`################################################################################`
Update solutions/web_crawl_Q&A/web-qa.py 2023-02-03 21:43:22 +00:00			`### Step 13`
Add in web crawl Q&A tutorial 2023-02-03 21:42:19 +00:00			`################################################################################`

			`print(answer_question(df, question="What day is it?", debug=False))`

Fixed UnicodeEncodeError when writing to file This pull request fixes the UnicodeEncodeError that was occurring when writing to a file. Updated the multiple open() calls to specify UTF-8 encoding. 2023-02-03 22:55:06 +00:00			`print(answer_question(df, question="What is our newest embeddings model?"))`