{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# PySpark DataFrame Loader\n", "\n", "This notebook goes over how to load data from a [PySpark](https://spark.apache.org/docs/latest/api/python/) DataFrame." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#!pip install pyspark" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import SparkSession" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", "23/05/31 14:08:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" ] } ], "source": [ "spark = SparkSession.builder.getOrCreate()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df = spark.read.csv(\"example_data/mlb_teams_2012.csv\", header=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from langchain.document_loaders import PySparkDataFrameLoader" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "loader = PySparkDataFrameLoader(spark, df, page_content_column=\"Team\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Stage 8:> (0 + 1) / 1]\r" ] }, { "data": { "text/plain": [ "[Document(page_content='Nationals', metadata={' \"Payroll (millions)\"': ' 81.34', ' \"Wins\"': ' 98'}),\n", " Document(page_content='Reds', metadata={' \"Payroll (millions)\"': ' 82.20', ' \"Wins\"': ' 97'}),\n", " Document(page_content='Yankees', metadata={' \"Payroll (millions)\"': ' 197.96', ' \"Wins\"': ' 95'}),\n", " Document(page_content='Giants', metadata={' \"Payroll (millions)\"': ' 117.62', ' \"Wins\"': ' 94'}),\n", " Document(page_content='Braves', metadata={' \"Payroll (millions)\"': ' 83.31', ' \"Wins\"': ' 94'}),\n", " Document(page_content='Athletics', metadata={' \"Payroll (millions)\"': ' 55.37', ' \"Wins\"': ' 94'}),\n", " Document(page_content='Rangers', metadata={' \"Payroll (millions)\"': ' 120.51', ' \"Wins\"': ' 93'}),\n", " Document(page_content='Orioles', metadata={' \"Payroll (millions)\"': ' 81.43', ' \"Wins\"': ' 93'}),\n", " Document(page_content='Rays', metadata={' \"Payroll (millions)\"': ' 64.17', ' \"Wins\"': ' 90'}),\n", " Document(page_content='Angels', metadata={' \"Payroll (millions)\"': ' 154.49', ' \"Wins\"': ' 89'}),\n", " Document(page_content='Tigers', metadata={' \"Payroll (millions)\"': ' 132.30', ' \"Wins\"': ' 88'}),\n", " Document(page_content='Cardinals', metadata={' \"Payroll (millions)\"': ' 110.30', ' \"Wins\"': ' 88'}),\n", " Document(page_content='Dodgers', metadata={' \"Payroll (millions)\"': ' 95.14', ' \"Wins\"': ' 86'}),\n", " Document(page_content='White Sox', metadata={' \"Payroll (millions)\"': ' 96.92', ' \"Wins\"': ' 85'}),\n", " Document(page_content='Brewers', metadata={' \"Payroll (millions)\"': ' 97.65', ' \"Wins\"': ' 83'}),\n", " Document(page_content='Phillies', metadata={' \"Payroll (millions)\"': ' 174.54', ' \"Wins\"': ' 81'}),\n", " Document(page_content='Diamondbacks', metadata={' \"Payroll (millions)\"': ' 74.28', ' \"Wins\"': ' 81'}),\n", " Document(page_content='Pirates', metadata={' \"Payroll (millions)\"': ' 63.43', ' \"Wins\"': ' 79'}),\n", " Document(page_content='Padres', metadata={' \"Payroll (millions)\"': ' 55.24', ' \"Wins\"': ' 76'}),\n", " Document(page_content='Mariners', metadata={' \"Payroll (millions)\"': ' 81.97', ' \"Wins\"': ' 75'}),\n", " Document(page_content='Mets', metadata={' \"Payroll (millions)\"': ' 93.35', ' \"Wins\"': ' 74'}),\n", " Document(page_content='Blue Jays', metadata={' \"Payroll (millions)\"': ' 75.48', ' \"Wins\"': ' 73'}),\n", " Document(page_content='Royals', metadata={' \"Payroll (millions)\"': ' 60.91', ' \"Wins\"': ' 72'}),\n", " Document(page_content='Marlins', metadata={' \"Payroll (millions)\"': ' 118.07', ' \"Wins\"': ' 69'}),\n", " Document(page_content='Red Sox', metadata={' \"Payroll (millions)\"': ' 173.18', ' \"Wins\"': ' 69'}),\n", " Document(page_content='Indians', metadata={' \"Payroll (millions)\"': ' 78.43', ' \"Wins\"': ' 68'}),\n", " Document(page_content='Twins', metadata={' \"Payroll (millions)\"': ' 94.08', ' \"Wins\"': ' 66'}),\n", " Document(page_content='Rockies', metadata={' \"Payroll (millions)\"': ' 78.06', ' \"Wins\"': ' 64'}),\n", " Document(page_content='Cubs', metadata={' \"Payroll (millions)\"': ' 88.19', ' \"Wins\"': ' 61'}),\n", " Document(page_content='Astros', metadata={' \"Payroll (millions)\"': ' 60.65', ' \"Wins\"': ' 55'})]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loader.load()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 }