Merge pull request #102 from openai/dev/logan/web-crawl-qa

Add in web crawl Q&A tutorial
2024-11-04 06:00:33 +00:00 · 2023-02-03 15:43:51 -06:00 · 2023-02-03 15:43:51 -06:00 · 11c53c27be
commit 11c53c27be
parent d063f1d86d 6c3c68144c
3 changed files with 1747 additions and 0 deletions
--- a/solutions/web_crawl_Q&A/requirements.txt
+++ b/solutions/web_crawl_Q&A/requirements.txt
@ -0,0 +1,80 @@
+aiohttp==3.8.3
+aiosignal==1.3.1
+appnope==0.1.3
+asttokens==2.2.1
+async-timeout==4.0.2
+attrs==22.2.0
+backcall==0.2.0
+beautifulsoup4==4.11.1
+blobfile==2.0.1
+bs4==0.0.1
+certifi==2022.12.7
+charset-normalizer==2.1.1
+comm==0.1.2
+contourpy==1.0.7
+cycler==0.11.0
+debugpy==1.6.5
+decorator==5.1.1
+docopt==0.6.2
+entrypoints==0.4
+executing==1.2.0
+filelock==3.9.0
+fonttools==4.38.0
+frozenlist==1.3.3
+html==1.13
+huggingface-hub==0.11.1
+idna==3.4
+ipykernel==6.20.1
+ipython==8.8.0
+jedi==0.18.2
+joblib==1.2.0
+jupyter_client==7.4.8
+jupyter_core==5.1.3
+kiwisolver==1.4.4
+lxml==4.9.2
+matplotlib==3.6.3
+matplotlib-inline==0.1.6
+multidict==6.0.4
+nest-asyncio==1.5.6
+numpy==1.24.1
+openai==0.26.1
+packaging==23.0
+pandas==1.5.2
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.4.0
+pipreqs==0.4.11
+platformdirs==2.6.2
+plotly==5.12.0
+prompt-toolkit==3.0.36
+psutil==5.9.4
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pycryptodomex==3.17
+Pygments==2.14.0
+pyparsing==3.0.9
+python-dateutil==2.8.2
+pytz==2022.7.1
+PyYAML==6.0
+pyzmq==24.0.1
+regex==2022.10.31
+requests==2.28.1
+scikit-learn==1.2.0
+scipy==1.10.0
+six==1.16.0
+soupsieve==2.3.2.post1
+stack-data==0.6.2
+tenacity==8.1.0
+threadpoolctl==3.1.0
+tiktoken==0.1.2
+tokenizers==0.13.2
+tornado==6.2
+tqdm==4.64.1
+traitlets==5.8.1
+transformers==4.25.1
+typing_extensions==4.4.0
+urllib3==1.26.13
+wcwidth==0.2.5
+yarg==0.1.9
+yarl==1.8.2
--- a/solutions/web_crawl_Q&A/web-qa.ipynb
+++ b/solutions/web_crawl_Q&A/web-qa.ipynb
--- a/solutions/web_crawl_Q&A/web-qa.py
+++ b/solutions/web_crawl_Q&A/web-qa.py
@ -0,0 +1,382 @@
+################################################################################
+### Step 1
+################################################################################
+
+import requests
+import re
+import urllib.request
+from bs4 import BeautifulSoup
+from collections import deque
+from html.parser import HTMLParser
+from urllib.parse import urlparse
+import os
+import pandas as pd
+import tiktoken
+import openai
+from openai.embeddings_utils import distances_from_embeddings
+import pandas as pd
+import numpy as np
+from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
+
+# Pattern used to recognise absolute http(s) URLs (anchored at the start)
+HTTP_URL_PATTERN = r'^http[s]*://.+'
+
+# Root domain to crawl, and the URL the crawl starts from
+domain = "openai.com"
+full_url = "https://" + domain + "/"
+
+# Create a class to parse the HTML and get the hyperlinks
+class HyperlinkParser(HTMLParser):
+    """Collect the ``href`` value of every ``<a>`` tag fed to the parser.
+
+    After ``feed(html)`` has been called, ``self.hyperlinks`` holds the raw
+    href strings in document order (duplicates kept, no normalisation).
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Raw href values, in the order the anchors were encountered
+        self.hyperlinks = []
+
+    # Override the HTMLParser's handle_starttag method to get the hyperlinks
+    def handle_starttag(self, tag, attrs):
+        """Record the href of an anchor tag.
+
+        ``attrs`` is the list of ``(name, value)`` pairs supplied by
+        HTMLParser; ``value`` is None for a bare attribute such as
+        ``<a href>``.
+        """
+        if tag != "a":
+            return
+
+        href = dict(attrs).get("href")
+
+        # A valueless href has no target; storing None would break callers
+        # that treat every collected entry as a string.
+        if href is not None:
+            self.hyperlinks.append(href)
+
+################################################################################
+### Step 2
+################################################################################
+
+# Function to get the hyperlinks from a URL
+def get_hyperlinks(url):
+    """Return the raw href values of all anchor tags on the page at ``url``.
+
+    This is best-effort: any failure while fetching or decoding the page is
+    printed and an empty list is returned, so a single bad page cannot abort
+    the caller. Responses that are not HTML also yield an empty list.
+    """
+    # Try to open the URL and read the HTML
+    try:
+        with urllib.request.urlopen(url) as response:
+            headers = response.info()
+
+            # get_content_type() lower-cases the value, drops parameters such
+            # as "; charset=..." and falls back to a default when the header
+            # is absent, so a missing Content-Type simply counts as non-HTML.
+            if headers.get_content_type() != "text/html":
+                return []
+
+            # Honour the charset declared by the server, defaulting to UTF-8.
+            # Undecodable bytes are replaced rather than discarding the page.
+            charset = headers.get_content_charset() or "utf-8"
+            html = response.read().decode(charset, errors="replace")
+    except Exception as e:
+        print(e)
+        return []
+
+    # Create the HTML Parser and then Parse the HTML to get hyperlinks
+    parser = HyperlinkParser()
+    parser.feed(html)
+
+    return parser.hyperlinks
+
+################################################################################
+### Step 3
+################################################################################
+
+# Function to get the hyperlinks from a URL that are within the same domain
+def get_domain_hyperlinks(local_domain, url):
+    """Return the de-duplicated links on the page at ``url`` that stay on ``local_domain``.
+
+    Absolute http(s) links are kept only when their host equals
+    ``local_domain``. Every other link is treated as a path relative to the
+    domain root and turned into ``https://<local_domain>/<link>``. Fragment
+    links and non-page schemes (mailto:, tel:, javascript:) are skipped.
+    A single trailing "/" is removed from each result. The order of the
+    returned list is unspecified.
+    """
+    # Prefixes of links that never point at a crawlable page
+    skipped_prefixes = ("#", "mailto:", "tel:", "javascript:")
+
+    clean_links = set()
+    for link in set(get_hyperlinks(url)):
+        # Guard against non-string entries (e.g. None from a bare <a href>),
+        # which would make re.search raise TypeError
+        if not isinstance(link, str):
+            continue
+
+        # Protocol-relative links ("//host/path") are absolute URLs that only
+        # omit the scheme; give them one so they get the same-domain check
+        # instead of being glued onto the local domain.
+        if link.startswith("//"):
+            link = "https:" + link
+
+        # If the link is a URL, check if it is within the same domain
+        if re.search(HTTP_URL_PATTERN, link):
+            if urlparse(link).netloc != local_domain:
+                continue
+            clean_link = link
+
+        # If the link is not a URL, check if it is a relative link
+        else:
+            if link.startswith("/"):
+                link = link[1:]
+            elif link.startswith(skipped_prefixes):
+                continue
+            clean_link = "https://" + local_domain + "/" + link
+
+        if clean_link.endswith("/"):
+            clean_link = clean_link[:-1]
+        clean_links.add(clean_link)
+
+    # Return the list of hyperlinks that are within the same domain
+    return list(clean_links)
+
+
+
+################################################################################
+### Step 4
+################################################################################
+
+def crawl(url):
+    """Crawl every same-domain page reachable from ``url`` and save its text.
+
+    The text of each visited page is written to
+    ``text/<domain>/<name>.txt``, where ``<name>`` is the URL without its
+    scheme and with "/" replaced by "_". The ``processed`` directory is
+    created as well. URLs are taken from the right-hand end of the deque, so
+    the most recently discovered link is visited first.
+
+    A page that cannot be fetched is reported and skipped instead of
+    aborting the whole crawl.
+    """
+    # Parse the URL and get the domain
+    local_domain = urlparse(url).netloc
+
+    # URLs still to visit, and every URL ever queued (prevents re-visiting)
+    queue = deque([url])
+    seen = {url}
+
+    # Create the output directories; exist_ok makes re-runs harmless and
+    # avoids the check-then-create race
+    text_dir = "text/" + local_domain + "/"
+    os.makedirs(text_dir, exist_ok=True)
+    os.makedirs("processed", exist_ok=True)
+
+    # While the queue is not empty, continue crawling
+    while queue:
+
+        # Get the next URL from the queue
+        url = queue.pop()
+        print(url) # for debugging and to see the progress
+
+        # Fetch before opening the output file so that a failed request does
+        # not leave an empty .txt file behind
+        try:
+            response = requests.get(url)
+        except requests.RequestException as e:
+            print("Unable to fetch page " + url + ": " + str(e))
+            continue
+
+        # Get the text but remove the tags
+        text = BeautifulSoup(response.text, "html.parser").get_text()
+
+        # Pages that require JavaScript are only reported: their placeholder
+        # text is still saved and the crawl continues
+        if "You need to enable JavaScript to run this app." in text:
+            print("Unable to parse page " + url + " due to JavaScript being required")
+
+        # Strip the scheme whatever its length ("http://" or "https://");
+        # for https URLs this yields the same name as slicing off 8 characters
+        file_name = url.split("://", 1)[-1].replace("/", "_") + ".txt"
+
+        # NOTE: the file is written with the platform's default encoding,
+        # the same default the later open(..., "r") call in this script uses
+        with open(text_dir + file_name, "w") as f:
+            f.write(text)
+
+        # Get the hyperlinks from the URL and add them to the queue
+        for link in get_domain_hyperlinks(local_domain, url):
+            if link not in seen:
+                queue.append(link)
+                seen.add(link)
+
+# Start crawling from full_url; this runs as soon as the script is executed
+crawl(full_url)
+
+################################################################################
+### Step 5
+################################################################################
+
+def remove_newlines(serie):
+    """Return ``serie`` with newlines turned into spaces and space runs collapsed.
+
+    Both real newline characters and the literal two-character sequence
+    backslash + "n" are replaced by a single space, after which every run
+    of two or more spaces is reduced to one.
+
+    ``serie`` is a pandas Series of strings; missing values pass through
+    the ``.str`` accessor untouched.
+    """
+    # regex=False pins a plain substring replacement. Without it the result
+    # depends on the pandas version's default, and as a regex '\\n' matches
+    # a newline rather than the literal backslash-n it is meant to remove.
+    serie = serie.str.replace('\n', ' ', regex=False)
+    serie = serie.str.replace('\\n', ' ', regex=False)
+
+    # Collapse runs of any length; two passes of '  ' -> ' ' left longer
+    # runs only partly reduced
+    serie = serie.str.replace(r' {2,}', ' ', regex=True)
+    return serie
+
+
+################################################################################
+### Step 6
+################################################################################
+
+# Create a list to store the text files
+texts=[]
+
+# The crawler names each file "<domain>_<path with "/" as "_">.txt", so the
+# prefix to drop is the domain plus one separator character. Deriving it from
+# `domain` keeps the title extraction correct for domains of any length.
+text_dir = "text/" + domain + "/"
+prefix_len = len(domain) + 1
+
+# Get all the text files in the text directory
+for file in os.listdir(text_dir):
+
+    # Open the file and read the text
+    with open(text_dir + file, "r") as f:
+        text = f.read()
+
+    # Omit the leading "<domain>_" characters and the trailing ".txt"
+    # (4 characters), then replace - and _ with spaces and drop "#update".
+    texts.append((file[prefix_len:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))
+
+# Create a dataframe from the list of texts
+df = pd.DataFrame(texts, columns = ['fname', 'text'])
+
+# Set the text column to be the title followed by the raw text with the newlines removed
+df['text'] = df.fname + ". " + remove_newlines(df.text)
+df.to_csv('processed/scraped.csv')
+df.head()
+
+################################################################################
+### Step 7
+################################################################################
+
+# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
+tokenizer = tiktoken.get_encoding("cl100k_base")
+
+
+def _count_tokens(page_text):
+    # Number of tokens the text encodes to with the tokenizer above
+    return len(tokenizer.encode(page_text))
+
+
+# Reload the scraped pages, using the first CSV column as the index,
+# and give the two data columns their working names
+df = pd.read_csv('processed/scraped.csv', index_col=0)
+df.columns = ['title', 'text']
+
+# Store the token count of every row's text in a new column
+df['n_tokens'] = df.text.apply(_count_tokens)
+
+# Plot a histogram of the per-row token counts
+df.n_tokens.hist()
+
+################################################################################
+### Step 8
+################################################################################
+
+max_tokens = 500
+
+# Function to split the text into chunks of a maximum number of tokens
+def split_into_many(text, max_tokens = max_tokens):
+    """Split ``text`` into chunks of whole sentences within a token budget.
+
+    The text is split on ". ". Sentences are accumulated until adding the
+    next one would exceed ``max_tokens``, at which point the accumulated
+    sentences are emitted as one chunk. A single sentence that alone
+    exceeds ``max_tokens`` is dropped. Token counts come from the
+    module-level ``tokenizer``.
+
+    Returns a list of chunk strings (empty when nothing fits).
+    """
+
+    # Split the text into sentences
+    sentences = text.split('. ')
+
+    # Get the number of tokens for each sentence
+    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
+
+    chunks = []
+    tokens_so_far = 0
+    chunk = []
+
+    # Loop through the sentences and tokens joined together in a tuple
+    for sentence, token in zip(sentences, n_tokens):
+
+        # If this sentence would overflow the budget, emit what has been
+        # gathered so far and start a new chunk. Nothing is emitted when the
+        # chunk is empty, which previously produced a bogus "." chunk.
+        if tokens_so_far + token > max_tokens:
+            if chunk:
+                chunks.append(". ".join(chunk) + ".")
+            chunk = []
+            tokens_so_far = 0
+
+        # If the number of tokens in the current sentence is greater than the max number of
+        # tokens, go to the next sentence
+        if token > max_tokens:
+            continue
+
+        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
+        # (+1 allows for the ". " separator re-inserted when joining)
+        chunk.append(sentence)
+        tokens_so_far += token + 1
+
+    # Emit the sentences still pending after the loop; without this the tail
+    # of every text was silently lost. The last sentence usually keeps its
+    # own full stop, so one is only added when missing.
+    if chunk:
+        tail = ". ".join(chunk)
+        chunks.append(tail if tail.endswith(".") else tail + ".")
+
+    return chunks
+    
+
+shortened = []
+
+# Loop through the dataframe
+for index, row in df.iterrows():
+
+    # Skip rows without text. After a CSV round-trip a missing value is NaN
+    # rather than None, so pd.isna is used; it covers both.
+    if pd.isna(row['text']):
+        continue
+
+    # If the number of tokens is greater than the max number of tokens, split the text into chunks
+    if row['n_tokens'] > max_tokens:
+        shortened += split_into_many(row['text'])
+
+    # Otherwise, add the text to the list of shortened texts
+    else:
+        shortened.append( row['text'] )
+
+################################################################################
+### Step 9
+################################################################################
+
+# Rebuild the dataframe from the shortened chunks, one chunk per row
+df = pd.DataFrame(shortened, columns=['text'])
+
+# Recount the tokens of every chunk and plot their distribution
+chunk_token_counts = df['text'].apply(lambda chunk: len(tokenizer.encode(chunk)))
+df['n_tokens'] = chunk_token_counts
+df['n_tokens'].hist()
+
+################################################################################
+### Step 10
+################################################################################
+
+def _embed_text(chunk):
+    # Request an embedding for one chunk and return the 'embedding' field
+    # of the first item in the response's 'data' list
+    response = openai.Embedding.create(input=chunk, engine='text-embedding-ada-002')
+    return response['data'][0]['embedding']
+
+
+# Embed every chunk, save the result to disk and preview it
+df['embeddings'] = df.text.apply(_embed_text)
+df.to_csv('processed/embeddings.csv')
+df.head()
+
+################################################################################
+### Step 11
+################################################################################
+
+from ast import literal_eval
+
+df=pd.read_csv('processed/embeddings.csv', index_col=0)
+
+# Each embedding is stored in the CSV as the text of a Python list.
+# literal_eval parses literals only, so unlike eval() it cannot execute
+# code placed in a tampered or corrupted CSV file.
+df['embeddings'] = df['embeddings'].apply(literal_eval).apply(np.array)
+
+df.head()
+
+################################################################################
+### Step 12
+################################################################################
+
+def create_context(
+    question, df, max_len=1800, size="ada"
+):
+    """
+    Build a context string for a question from the dataframe rows closest to it.
+
+    The question is embedded, the cosine distance to every row's embedding is
+    stored in df['distances'] (the dataframe passed in is modified), and row
+    texts are collected in order of increasing distance until the running
+    total of n_tokens (plus 4 per row) would exceed max_len. The collected
+    texts are joined with a "###" separator line.
+
+    The size argument is accepted but not used.
+    """
+
+    # Embed the question
+    embedding_response = openai.Embedding.create(input=question, engine='text-embedding-ada-002')
+    q_embeddings = embedding_response['data'][0]['embedding']
+
+    # Distance from the question to every stored embedding
+    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
+
+    # Closest rows first
+    ranked = df.sort_values('distances', ascending=True)
+
+    selected = []
+    used_tokens = 0
+    for chunk_text, chunk_tokens in zip(ranked['text'], ranked['n_tokens']):
+
+        # Count this row against the budget before deciding whether it fits
+        used_tokens += chunk_tokens + 4
+
+        # Stop at the first row that pushes the total over the limit
+        if used_tokens > max_len:
+            break
+
+        selected.append(chunk_text)
+
+    # Join the selected texts into one context string
+    return "\n\n###\n\n".join(selected)
+
+def answer_question(
+    df,
+    model="text-davinci-003",
+    question="Am I allowed to publish model outputs to Twitter, without a human review?",
+    max_len=1800,
+    size="ada",
+    debug=False,
+    max_tokens=150,
+    stop_sequence=None
+):
+    """
+    Answer a question using the most similar texts in the dataframe as context.
+
+    The context is built with create_context and, when debug is true,
+    printed. A completion is then requested from the given model with a
+    prompt that tells it to say "I don't know" when the context does not
+    contain the answer. Returns the stripped completion text, or an empty
+    string if the request raises (the exception is printed).
+    """
+    context = create_context(question, df, max_len=max_len, size=size)
+
+    # In debug mode, show the context that will be sent to the model
+    if debug:
+        print("Context:\n" + context)
+        print("\n\n")
+
+    try:
+        # Build the prompt from the context and the question
+        prompt = f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:"
+
+        # Request the completion
+        response = openai.Completion.create(
+            prompt=prompt,
+            temperature=0,
+            max_tokens=max_tokens,
+            top_p=1,
+            frequency_penalty=0,
+            presence_penalty=0,
+            stop=stop_sequence,
+            model=model,
+        )
+        first_choice = response["choices"][0]
+        return first_choice["text"].strip()
+    except Exception as e:
+        print(e)
+        return ""
+
+################################################################################
+### Step 13
+################################################################################
+
+# Ask two sample questions and print each answer, separated by a blank line
+sample_questions = ("What day is it?", "What is our newest embeddings model?")
+print("\n\n".join(answer_question(df, question=q, debug=False) for q in sample_questions))