mirror of
https://github.com/openai/openai-cookbook
synced 2024-11-04 06:00:33 +00:00
Merge pull request #102 from openai/dev/logan/web-crawl-qa
Add in web crawl Q&A tutorial
This commit is contained in:
commit
11c53c27be
80 solutions/web_crawl_Q&A/requirements.txt Normal file
@@ -0,0 +1,80 @@
aiohttp==3.8.3
aiosignal==1.3.1
appnope==0.1.3
asttokens==2.2.1
async-timeout==4.0.2
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.1
blobfile==2.0.1
bs4==0.0.1
certifi==2022.12.7
charset-normalizer==2.1.1
comm==0.1.2
contourpy==1.0.7
cycler==0.11.0
debugpy==1.6.5
decorator==5.1.1
docopt==0.6.2
entrypoints==0.4
executing==1.2.0
filelock==3.9.0
fonttools==4.38.0
frozenlist==1.3.3
html==1.13
huggingface-hub==0.11.1
idna==3.4
ipykernel==6.20.1
ipython==8.8.0
jedi==0.18.2
joblib==1.2.0
jupyter_client==7.4.8
jupyter_core==5.1.3
kiwisolver==1.4.4
lxml==4.9.2
matplotlib==3.6.3
matplotlib-inline==0.1.6
multidict==6.0.4
nest-asyncio==1.5.6
numpy==1.24.1
openai==0.26.1
packaging==23.0
pandas==1.5.2
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.4.0
pipreqs==0.4.11
platformdirs==2.6.2
plotly==5.12.0
prompt-toolkit==3.0.36
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pycryptodomex==3.17
Pygments==2.14.0
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2022.7.1
PyYAML==6.0
pyzmq==24.0.1
regex==2022.10.31
requests==2.28.1
scikit-learn==1.2.0
scipy==1.10.0
six==1.16.0
soupsieve==2.3.2.post1
stack-data==0.6.2
tenacity==8.1.0
threadpoolctl==3.1.0
tiktoken==0.1.2
tokenizers==0.13.2
tornado==6.2
tqdm==4.64.1
traitlets==5.8.1
transformers==4.25.1
typing_extensions==4.4.0
urllib3==1.26.13
wcwidth==0.2.5
yarg==0.1.9
yarl==1.8.2
1285 solutions/web_crawl_Q&A/web-qa.ipynb Normal file
File diff suppressed because one or more lines are too long
382 solutions/web_crawl_Q&A/web-qa.py Normal file
@@ -0,0 +1,382 @@
################################################################################
### Step 1
################################################################################

import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import numpy as np
import tiktoken
import openai
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Define root domain to crawl
domain = "openai.com"
full_url = "https://openai.com/"

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])
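
# Illustration (not part of the original commit): feeding this parser a fragment such as
#   '<a href="/blog">Blog</a> <a href="https://openai.com/api">API</a>'
# via parser.feed(...) leaves parser.hyperlinks == ['/blog', 'https://openai.com/api'];
# relative links like '/blog' are resolved against the domain in Step 3.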

################################################################################
### Step 2
################################################################################

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):

    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []

            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks


################################################################################
### Step 3
################################################################################

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))
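
# Example (illustrative, not part of the original commit): with local_domain="openai.com",
# a page containing the links '/blog/', '#nav', 'mailto:press@openai.com' and
# 'https://twitter.com/openai' reduces to ['https://openai.com/blog']; fragments,
# mailto links and external domains are dropped, and trailing slashes are stripped.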

################################################################################
### Step 4
################################################################################

def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])

    # Create a directory to store the text files
    if not os.path.exists("text/"):
        os.mkdir("text/")

    if not os.path.exists("text/"+local_domain+"/"):
        os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
        os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w") as f:

            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")

            # Get the text but remove the tags
            text = soup.get_text()

            # If the page requires JavaScript to render, warn that its text could not be
            # extracted (the placeholder text is still written below)
            if ("You need to enable JavaScript to run this app." in text):
                print("Unable to parse page " + url + " due to JavaScript being required")

            # Write the text to the file in the text directory
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

crawl(full_url)
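
# Note (illustrative, not part of the original commit): after this call the raw page text
# lives under text/openai.com/ as one .txt file per crawled URL, and the processed/
# directory is ready for the CSVs created in the steps below.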

################################################################################
### Step 5
################################################################################

def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie


################################################################################
### Step 6
################################################################################

# Create a list to store the text files
texts=[]

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r") as f:
        text = f.read()

        # Omit the first 11 characters (the "openai.com_" prefix) and the last 4 characters (".txt"),
        # then replace - and _ with spaces and drop #update
        texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns = ['fname', 'text'])

# Set the text column to be the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')
df.head()
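
# Illustrative note (not part of the original commit): processed/scraped.csv now holds an
# index column plus 'fname' and 'text'; Step 7 reloads it and renames those columns to
# 'title' and 'text'.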

################################################################################
### Step 7
################################################################################

# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# Visualize the distribution of the number of tokens per row using a histogram
df.n_tokens.hist()


################################################################################
### Step 8
################################################################################

max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, skip it and go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add any sentences remaining in the last chunk
    if chunk:
        chunks.append(". ".join(chunk) + ".")

    return chunks
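
# For intuition (illustrative, not part of the original commit): with max_tokens=500, a
# ~1,200-token page typically splits into three chunks of at most 500 tokens each; any
# single sentence that alone exceeds 500 tokens is skipped by the `continue` above.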

shortened = []

# Loop through the dataframe
for row in df.iterrows():

    # If the text is None, go to the next row
    if row[1]['text'] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]['n_tokens'] > max_tokens:
        shortened += split_into_many(row[1]['text'])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append( row[1]['text'] )


################################################################################
### Step 9
################################################################################

df = pd.DataFrame(shortened, columns = ['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
df.n_tokens.hist()


################################################################################
### Step 10
################################################################################

df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
df.to_csv('processed/embeddings.csv')
df.head()
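
# Illustrative note (not part of the original commit): each entry in df['embeddings'] is a
# 1536-dimensional vector from text-embedding-ada-002; after the round trip through
# embeddings.csv it comes back as a string, which Step 11 converts to a NumPy array.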

################################################################################
### Step 11
################################################################################

df=pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

df.head()


################################################################################
### Step 12
################################################################################

def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)

def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the context that was retrieved
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""


################################################################################
### Step 13
################################################################################

print(answer_question(df, question="What day is it?", debug=False))

print(answer_question(df, question="What is our newest embeddings model?"))