mirror of
https://github.com/hwchase17/langchain
synced 2024-10-31 15:20:26 +00:00
156 lines
6.1 KiB
Plaintext
156 lines
6.1 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"attachments": {},
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# PySpark DataFrame Loader\n",
|
|
"\n",
|
|
"This notebook goes over how to load data from a [PySpark](https://spark.apache.org/docs/latest/api/python/) DataFrame."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#!pip install pyspark"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from pyspark.sql import SparkSession"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Setting default log level to \"WARN\".\n",
|
|
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
|
|
"23/05/31 14:08:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"spark = SparkSession.builder.getOrCreate()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = spark.read.csv(\"example_data/mlb_teams_2012.csv\", header=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.document_loaders import PySparkDataFrameLoader"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"loader = PySparkDataFrameLoader(spark, df, page_content_column=\"Team\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[Stage 8:> (0 + 1) / 1]\r"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[Document(page_content='Nationals', metadata={' \"Payroll (millions)\"': ' 81.34', ' \"Wins\"': ' 98'}),\n",
|
|
" Document(page_content='Reds', metadata={' \"Payroll (millions)\"': ' 82.20', ' \"Wins\"': ' 97'}),\n",
|
|
" Document(page_content='Yankees', metadata={' \"Payroll (millions)\"': ' 197.96', ' \"Wins\"': ' 95'}),\n",
|
|
" Document(page_content='Giants', metadata={' \"Payroll (millions)\"': ' 117.62', ' \"Wins\"': ' 94'}),\n",
|
|
" Document(page_content='Braves', metadata={' \"Payroll (millions)\"': ' 83.31', ' \"Wins\"': ' 94'}),\n",
|
|
" Document(page_content='Athletics', metadata={' \"Payroll (millions)\"': ' 55.37', ' \"Wins\"': ' 94'}),\n",
|
|
" Document(page_content='Rangers', metadata={' \"Payroll (millions)\"': ' 120.51', ' \"Wins\"': ' 93'}),\n",
|
|
" Document(page_content='Orioles', metadata={' \"Payroll (millions)\"': ' 81.43', ' \"Wins\"': ' 93'}),\n",
|
|
" Document(page_content='Rays', metadata={' \"Payroll (millions)\"': ' 64.17', ' \"Wins\"': ' 90'}),\n",
|
|
" Document(page_content='Angels', metadata={' \"Payroll (millions)\"': ' 154.49', ' \"Wins\"': ' 89'}),\n",
|
|
" Document(page_content='Tigers', metadata={' \"Payroll (millions)\"': ' 132.30', ' \"Wins\"': ' 88'}),\n",
|
|
" Document(page_content='Cardinals', metadata={' \"Payroll (millions)\"': ' 110.30', ' \"Wins\"': ' 88'}),\n",
|
|
" Document(page_content='Dodgers', metadata={' \"Payroll (millions)\"': ' 95.14', ' \"Wins\"': ' 86'}),\n",
|
|
" Document(page_content='White Sox', metadata={' \"Payroll (millions)\"': ' 96.92', ' \"Wins\"': ' 85'}),\n",
|
|
" Document(page_content='Brewers', metadata={' \"Payroll (millions)\"': ' 97.65', ' \"Wins\"': ' 83'}),\n",
|
|
" Document(page_content='Phillies', metadata={' \"Payroll (millions)\"': ' 174.54', ' \"Wins\"': ' 81'}),\n",
|
|
" Document(page_content='Diamondbacks', metadata={' \"Payroll (millions)\"': ' 74.28', ' \"Wins\"': ' 81'}),\n",
|
|
" Document(page_content='Pirates', metadata={' \"Payroll (millions)\"': ' 63.43', ' \"Wins\"': ' 79'}),\n",
|
|
" Document(page_content='Padres', metadata={' \"Payroll (millions)\"': ' 55.24', ' \"Wins\"': ' 76'}),\n",
|
|
" Document(page_content='Mariners', metadata={' \"Payroll (millions)\"': ' 81.97', ' \"Wins\"': ' 75'}),\n",
|
|
" Document(page_content='Mets', metadata={' \"Payroll (millions)\"': ' 93.35', ' \"Wins\"': ' 74'}),\n",
|
|
" Document(page_content='Blue Jays', metadata={' \"Payroll (millions)\"': ' 75.48', ' \"Wins\"': ' 73'}),\n",
|
|
" Document(page_content='Royals', metadata={' \"Payroll (millions)\"': ' 60.91', ' \"Wins\"': ' 72'}),\n",
|
|
" Document(page_content='Marlins', metadata={' \"Payroll (millions)\"': ' 118.07', ' \"Wins\"': ' 69'}),\n",
|
|
" Document(page_content='Red Sox', metadata={' \"Payroll (millions)\"': ' 173.18', ' \"Wins\"': ' 69'}),\n",
|
|
" Document(page_content='Indians', metadata={' \"Payroll (millions)\"': ' 78.43', ' \"Wins\"': ' 68'}),\n",
|
|
" Document(page_content='Twins', metadata={' \"Payroll (millions)\"': ' 94.08', ' \"Wins\"': ' 66'}),\n",
|
|
" Document(page_content='Rockies', metadata={' \"Payroll (millions)\"': ' 78.06', ' \"Wins\"': ' 64'}),\n",
|
|
" Document(page_content='Cubs', metadata={' \"Payroll (millions)\"': ' 88.19', ' \"Wins\"': ' 61'}),\n",
|
|
" Document(page_content='Astros', metadata={' \"Payroll (millions)\"': ' 60.65', ' \"Wins\"': ' 55'})]"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"loader.load()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|