{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "fr_txRZ1i1_4" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "fFqeiV3qxso7" }, "outputs": [], "source": [ "################ template to run PySpark on Colab #######################" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 35868, "status": "ok", "timestamp": 1592469024441, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "m6uGlMRavWlb" }, "outputs": [], "source": [ "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n", "!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz\n", "!tar xf spark-2.4.5-bin-hadoop2.7.tgz\n", "!pip install -q findspark" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 35291, "status": "ok", "timestamp": 1592469024442, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "f75mt5iLvaI3" }, "outputs": [], "source": [ "import os\n", "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", "os.environ[\"SPARK_HOME\"] = \"/content/spark-2.4.5-bin-hadoop2.7\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 39386, "status": "ok", "timestamp": 1592469029149, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "X1NyKr7qvaGP" }, "outputs": [], "source": [ "import findspark\n", "findspark.init()\n", "\n", "from pyspark.sql import SparkSession\n", "spark = SparkSession.builder.master(\"local[*]\").getOrCreate()\n", "spark1 = SparkSession.builder.appName('basic').getOrCreate()\n", "#Test must give no error" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 38615, "status": "ok", "timestamp": 1592469029151, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "MXYgwjn4vaAH" }, "outputs": [], "source": [ "import pyspark" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 38056, "status": "ok", "timestamp": 1592469029153, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "08V9HAqMxm9D" }, "outputs": [], "source": [ "################ end template PySpark on Colab ##########################" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 37584, "status": "ok", "timestamp": 1592469029154, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "rkhXfoJzCOtN" }, "outputs": [], "source": [ "from pyspark import SparkConf, SparkContext\n", "conf = SparkConf().setAppName(\"basic\").setMaster(\"local\")\n", "#sc = SparkContext(conf=conf) ## for jupyter and Databricks\n", "sc = SparkContext.getOrCreate() ## for Colab" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 521, "status": "ok", "timestamp": 1592469074827, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "7BYL1qr3jLh-" }, "outputs": [], "source": [ "from pyspark.sql.types import *" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "b7hyPjIpjgrq" }, "source": [ "# Dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load dataset" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 648 }, "colab_type": "code", "executionInfo": { "elapsed": 8313, "status": "ok", "timestamp": 1592469116466, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "pOIesB3ijLfW", "outputId": "b60a5d4c-198b-4ef0-839a-3763450c0ab3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2020-06-18 08:31:49-- https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/shirts.csv\n", "Resolving frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)... 52.95.150.54\n", "Connecting to frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)|52.95.150.54|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 1741 (1.7K) [application/vnd.ms-excel]\n", "Saving to: ‘shirts.csv’\n", "\n", "\r", "shirts.csv 0%[ ] 0 --.-KB/s \r", "shirts.csv 100%[===================>] 1.70K --.-KB/s in 0s \n", "\n", "2020-06-18 08:31:49 (79.7 MB/s) - ‘shirts.csv’ saved [1741/1741]\n", "\n", "+---+------+------+------+\n", "|_c0|taglia|colore|prezzo|\n", "+---+------+------+------+\n", "| 0| S|bianco| 4.99|\n", "| 1| M|bianco| 19.99|\n", "| 2| XL|bianco| 12.49|\n", "| 3| XL|bianco| 14.99|\n", "| 4| S|bianco| 14.99|\n", "| 5| S| verde| 7.99|\n", "| 6| M| verde| 4.99|\n", "| 7| L| verde| 12.49|\n", "| 8| XL|bianco| 12.49|\n", "| 9| M| verde| 19.99|\n", "| 10| L|bianco| 14.99|\n", "| 11| XL|bianco| 19.99|\n", "| 12| M|bianco| 4.99|\n", "| 13| L|bianco| 7.99|\n", "| 14| M|bianco| 14.99|\n", "| 15| XL| rosso| 9.99|\n", "| 16| S| rosso| 12.49|\n", "| 17| L|bianco| 7.99|\n", "| 18| XL|bianco| 4.99|\n", "| 19| M| verde| 14.99|\n", "+---+------+------+------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "!wget https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/shirts.csv\n", "df0 = spark.read.load(\"shirts.csv\", format=\"csv\", sep=\",\", inferSchema=\"true\", header=\"true\")\n", "df0.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "Ujd-bI4Ijmuc" }, "source": [ "## Creazione dataframe" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "N3ysjRE7jqYP" }, "source": [ "Possiamo creare un nuovo Dataframe usando il metodo .createDataFrame(data, names) dell'oggetto SparkSession, questo metodo ha bisogno di due parametri:\n", "\n", " Una lista di tuple, in cui ogni tupla corrisponde ad una riga del Dataframe.\n", " Una lista con i nomi per le colonne\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 488, "status": "ok", "timestamp": 1592469193232, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "D2QjoH8cjLdX" }, "outputs": [], "source": [ "data = [(\"Gianluca\", \"M\", 23, 174, 70.5),\n", " (\"Andrea\", \"M\", 37, 179, 68.),\n", " (\"Marco\", \"M\", 33, 172, 88.5),\n", " (\"Annalisa\", \"F\", 38, 155, 50.2),\n", " (\"Monica\", \"F\", 25, 165, 54.3)]\n", "\n", "df = spark.createDataFrame(data, [\"name\", \"gender\", \"age\", \"height\",\"weight\"])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 1606, "status": "ok", "timestamp": 1592469204952, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "oJdXVTEGjLa7", "outputId": "63a8e297-aa5e-4d52-b372-1ef9d34e8fd0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+------+---+------+------+\n", "| name|gender|age|height|weight|\n", "+--------+------+---+------+------+\n", "|Gianluca| M| 23| 174| 70.5|\n", "| Andrea| M| 37| 179| 68.0|\n", "| Marco| M| 33| 172| 88.5|\n", "|Annalisa| F| 38| 155| 50.2|\n", "| Monica| F| 25| 165| 54.3|\n", "+--------+------+---+------+------+\n", "\n" ] } ], "source": [ "df.show()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 170 }, "colab_type": "code", "executionInfo": { "elapsed": 698, "status": "ok", "timestamp": 1592469213483, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "dMuHKISSjLWn", "outputId": "106e3ef3-0b99-43d0-d8fb-98f39fdfeabd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+------+---+------+------+\n", "| name|gender|age|height|weight|\n", "+--------+------+---+------+------+\n", "|Gianluca| M| 23| 174| 70.5|\n", "| Andrea| M| 37| 179| 68.0|\n", "| Marco| M| 33| 172| 88.5|\n", "+--------+------+---+------+------+\n", "only showing top 3 rows\n", "\n" ] } ], "source": [ "df.show(3)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 478, "status": "ok", "timestamp": 1592469225731, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "SyY-kaRKjLUs", "outputId": "448d0657-92a7-4ab0-9f15-fb0882b7eed4" }, "outputs": [ { "data": { "text/plain": [ "['name', 'gender', 'age', 'height', 'weight']" ] }, "execution_count": 13, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "VixPqRkHkC2t" }, "source": [ "Per stampare lo schema del Dataframe, cioè le informazioni legate ad ogni attributo (nome, tipo, se può essere null), possiamo usare il metodo .printSchema()." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 514, "status": "ok", "timestamp": 1592469244654, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "imckHZ3RjLR8", "outputId": "bd6587db-8f01-4087-8da3-f2a77e5b23ba" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- name: string (nullable = true)\n", " |-- gender: string (nullable = true)\n", " |-- age: long (nullable = true)\n", " |-- height: long (nullable = true)\n", " |-- weight: double (nullable = true)\n", "\n" ] } ], "source": [ "df.printSchema()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "s1v_GGtSkHjW" }, "source": [ "Possiamo visualizzare una serie di informazioni statistiche (count, valore medio, deviazione standard, valore minimo e massimo) usando il metodo .describe()," ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 2104, "status": "ok", "timestamp": 1592469273206, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "PP39tJi7jLQB", "outputId": "3b2d9cb5-f9e4-47a2-b1a7-fcf8d06d8a1c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------+------+-----------------+-----------------+-----------------+\n", "|summary| name|gender| age| height| weight|\n", "+-------+------+------+-----------------+-----------------+-----------------+\n", "| count| 5| 5| 5| 5| 5|\n", "| mean| null| null| 31.2| 169.0| 66.3|\n", "| stddev| null| null|6.870225614927067|9.300537618869138|15.13753612712452|\n", "| min|Andrea| F| 23| 155| 50.2|\n", "| max|Monica| M| 38| 179| 88.5|\n", "+-------+------+------+-----------------+-----------------+-----------------+\n", "\n" ] } ], "source": [ "df.describe().show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "0L7HTHWwkNPb" }, "source": [ "## Modificare lo schema" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "93s1lvaOkUIW" }, "source": [ "lo schema del Dataframe è stato estratto direttamente dai dati, ma se volessimo definirlo noi ?\n", "\n", " age: da long a intero.\n", " height: da long a intero.\n", " weight: da double a float.\n", "\n", "Possiamo farlo creando uno schema per poi passarlo al metodo .createDataFrame(data, schema)." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 912, "status": "ok", "timestamp": 1592469346823, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "I_oS6Kb_jLNh" }, "outputs": [], "source": [ "data_schema = [StructField('name', StringType(), True),\n", " StructField('gender', StringType(), True),\n", " StructField('age', IntegerType(), True),\n", " StructField('height', IntegerType(), True),\n", " StructField('weight', FloatType(), True)]\n", " \n", "schema = StructType(fields=data_schema)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "hf8GySqokjBd" }, "source": [ "Adesso creiamo il Dataframe, passando i dati e lo schema all'interno del parametro schema." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 832, "status": "ok", "timestamp": 1592469358962, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "1q_XLDFUjKzp", "outputId": "374784d3-e140-4900-debf-99e599519456" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+------+---+------+------+\n", "| name|gender|age|height|weight|\n", "+--------+------+---+------+------+\n", "|Gianluca| M| 23| 174| 70.5|\n", "| Andrea| M| 37| 179| 68.0|\n", "| Marco| M| 33| 172| 88.5|\n", "|Annalisa| F| 38| 155| 50.2|\n", "| Monica| F| 25| 165| 54.3|\n", "+--------+------+---+------+------+\n", "\n" ] } ], "source": [ "df = spark.createDataFrame(data, schema=schema)\n", "df.show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 680, "status": "ok", "timestamp": 1592469386326, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "xg2DX1_Ui2AR", "outputId": "26fc8b55-4297-4979-f721-b03ed041c2ad" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- name: string (nullable = true)\n", " |-- gender: string (nullable = true)\n", " |-- age: integer (nullable = true)\n", " |-- height: integer (nullable = true)\n", " |-- weight: float (nullable = true)\n", "\n" ] } ], "source": [ "df.printSchema()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "CIHc38arkpM4" }, "source": [ "## Righe Colonne" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 102 }, "colab_type": "code", "executionInfo": { "elapsed": 880, "status": "ok", "timestamp": 1592469409572, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "vHmn7T5vi2AU", "outputId": "773534b9-8162-42cc-9a6b-2cc3a1110d00" }, "outputs": [ { "data": { "text/plain": [ "[Row(name='Gianluca', gender='M', age=23, height=174, weight=70.5),\n", " Row(name='Andrea', gender='M', age=37, height=179, weight=68.0),\n", " Row(name='Marco', gender='M', age=33, height=172, weight=88.5),\n", " Row(name='Annalisa', gender='F', age=38, height=155, weight=50.20000076293945),\n", " Row(name='Monica', gender='F', age=25, height=165, weight=54.29999923706055)]" ] }, "execution_count": 19, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "df.head(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "ZrN9IQf-kvoy" }, "source": [ "Per selezionare solo una colonna del Dataframe possiamo usare l'indice o il nome" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 513, "status": "ok", "timestamp": 1592469428469, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "3xmAv249kstp", "outputId": "3520cb27-c19d-47c4-c6cf-5b84ad709de1" }, "outputs": [ { "data": { "text/plain": [ "Column" ] }, "execution_count": 20, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "df[0]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 633, "status": "ok", "timestamp": 1592469439838, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "9veTgBO1kyVT", "outputId": "d97275ac-157e-495f-fe43-baee3d24275e" }, "outputs": [ { "data": { "text/plain": [ "Column" ] }, "execution_count": 21, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "df[\"name\"]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 812, "status": "ok", "timestamp": 1592469470952, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "vIndgQIjkySq", "outputId": "87c4e804-af7c-45a4-a71c-7a89c6e80d69" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+\n", "| name|\n", "+--------+\n", "|Gianluca|\n", "| Andrea|\n", "| Marco|\n", "|Annalisa|\n", "| Monica|\n", "+--------+\n", "\n" ] } ], "source": [ "dfName = df.select(\"name\")\n", "dfName.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "wM5eRDdBk-cT" }, "source": [ "Possiamo usare lo stesso metodo per selezionare più colonne, passando una lista di nomi come parametro." ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 754, "status": "ok", "timestamp": 1592469491680, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "UQGaar-bkyQu", "outputId": "cfd27979-6b8a-4d9a-f17b-6c90c65db52f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---+\n", "| name|age|\n", "+--------+---+\n", "|Gianluca| 23|\n", "| Andrea| 37|\n", "| Marco| 33|\n", "|Annalisa| 38|\n", "| Monica| 25|\n", "+--------+---+\n", "\n" ] } ], "source": [ "df.select([\"name\",\"age\"]).show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "npibj1dBlCy8" }, "source": [ "## Creare e modificare colonne" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "x16t8RImlF2D" }, "source": [ "Possiamo modificare una determinata colonna utilizzando il metodo .withColumn(name, column). alla quale dovremo passare il nome della riga che dovremmo modificare e un oggetto colonna" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 712, "status": "ok", "timestamp": 1592469524211, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "o3R_hPAHkyPZ", "outputId": "a41a6bb0-311f-4ee2-fad9-bf0f99a6b7e2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+------+---+------+------+\n", "| name|gender|age|height|weight|\n", "+--------+------+---+------+------+\n", "|Gianluca| M| 23| 1.74| 70.5|\n", "| Andrea| M| 37| 1.79| 68.0|\n", "| Marco| M| 33| 1.72| 88.5|\n", "|Annalisa| F| 38| 1.55| 50.2|\n", "| Monica| F| 25| 1.65| 54.3|\n", "+--------+------+---+------+------+\n", "\n" ] } ], "source": [ "df = df.withColumn(\"height\", df[\"height\"]/100)\n", "df.show()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 680, "status": "ok", "timestamp": 1592469541057, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "fjwgQ0wTkyM_", "outputId": "e4adc5f2-e370-4923-bf30-2f714293c03f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- name: string (nullable = true)\n", " |-- gender: string (nullable = true)\n", " |-- age: integer (nullable = true)\n", " |-- height: double (nullable = true)\n", " |-- weight: float (nullable = true)\n", "\n" ] } ], "source": [ "df.printSchema()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "nZYD5aMblQmF" }, "source": [ "Lo schema è stato modificato automaticamente, dato che abbiamo convertito l'altezza da numeri interi a numeri con la virgola. " ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "In501RU7lWD2" }, "source": [ "l'indice di massa corporea (bmi) di ogni persona:\n", "$$ bmi = (\\frac{weight}{height})^2 $$\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 701, "status": "ok", "timestamp": 1592469596192, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "cEjsnEsjkyLW", "outputId": "674ed155-361f-467b-f45d-f8d788599317" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+------+---+------+------+------------------+\n", "| name|gender|age|height|weight| bmi|\n", "+--------+------+---+------+------+------------------+\n", "|Gianluca| M| 23| 1.74| 70.5|23.285770907649624|\n", "| Andrea| M| 37| 1.79| 68.0| 21.22280827689523|\n", "| Marco| M| 33| 1.72| 88.5| 29.91481882098432|\n", "|Annalisa| F| 38| 1.55| 50.2| 20.89490146220164|\n", "| Monica| F| 25| 1.65| 54.3|19.944903301032344|\n", "+--------+------+---+------+------+------------------+\n", "\n" ] } ], "source": [ "bmi = df[\"weight\"]/(df[\"height\"]**2)\n", "df = df.withColumn(\"bmi\", bmi)\n", "df.show()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 1012, "status": "ok", "timestamp": 1592469617720, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "P9LBDKB_kyI7", "outputId": "f378ff01-6aff-4935-f29a-9242072a39ea" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+------+---+------+------+-----+\n", "| name|gender|age|height|weight| bmi|\n", "+--------+------+---+------+------+-----+\n", "|Gianluca| M| 23| 1.74| 70.5|23.29|\n", "| Andrea| M| 37| 1.79| 68.0|21.22|\n", "| Marco| M| 33| 1.72| 88.5|29.91|\n", "|Annalisa| F| 38| 1.55| 50.2|20.89|\n", "| Monica| F| 25| 1.65| 54.3|19.94|\n", "+--------+------+---+------+------+-----+\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import round\n", "\n", "df = df.withColumn(\"bmi\", round(df[\"bmi\"], 2))\n", "df.show()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 810, "status": "ok", "timestamp": 1592469643096, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "HI9PP7uNkyIA", "outputId": "48883ac1-0a4a-4319-b263-1f1e13e4625f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+------+---+------+------+-----+------+\n", "| name|gender|age|height|weight| bmi|is_fat|\n", "+--------+------+---+------+------+-----+------+\n", "|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n", "| Andrea| M| 37| 1.79| 68.0|21.22| false|\n", "| Marco| M| 33| 1.72| 88.5|29.91| true|\n", "|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n", "| Monica| F| 25| 1.65| 54.3|19.94| false|\n", "+--------+------+---+------+------+-----+------+\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import col, when\n", "\n", "df = df.withColumn(\"is_fat\", when(col(\"bmi\")>25, True).otherwise(False))\n", "df.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "PKyseaydmHNJ" }, "source": [ "Per finire rinominiamo la colonna gender in sex, possiamo farlo tramite il metodo .withColumnRenamed(old_name, new_name)." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 881, "status": "ok", "timestamp": 1592469788286, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "kdJr0ytKkyFO", "outputId": "924e2a0e-a751-4e56-cfef-50efea664706" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---+---+------+------+-----+------+\n", "| name|sex|age|height|weight| bmi|is_fat|\n", "+--------+---+---+------+------+-----+------+\n", "|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n", "| Andrea| M| 37| 1.79| 68.0|21.22| false|\n", "| Marco| M| 33| 1.72| 88.5|29.91| true|\n", "|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n", "| Monica| F| 25| 1.65| 54.3|19.94| false|\n", "+--------+---+---+------+------+-----+------+\n", "\n" ] } ], "source": [ "df = df.withColumnRenamed(\"gender\",\"sex\")\n", "df.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "rindol0YmLur" }, "source": [ "### Filtri" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 153 }, "colab_type": "code", "executionInfo": { "elapsed": 1274, "status": "ok", "timestamp": 1592469810758, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "7oWU2EQckxab", "outputId": "60380188-dd0b-44d8-9c97-7e1145585ea8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---+---+------+------+-----+------+\n", "| name|sex|age|height|weight| bmi|is_fat|\n", "+--------+---+---+------+------+-----+------+\n", "|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n", "| Andrea| M| 37| 1.79| 68.0|21.22| false|\n", "| Marco| M| 33| 1.72| 88.5|29.91| true|\n", "+--------+---+---+------+------+-----+------+\n", "\n" ] } ], "source": [ "df_male = df.filter(\"sex == 'M'\")\n", "df_male.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "tv0-rXVfmQz8" }, "source": [ "oppure..." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 153 }, "colab_type": "code", "executionInfo": { "elapsed": 849, "status": "ok", "timestamp": 1592469827022, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "c2zTdLYRmKM2", "outputId": "dca1d869-58ae-4cae-c9d2-6b862aa2d57e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---+---+------+------+-----+------+\n", "| name|sex|age|height|weight| bmi|is_fat|\n", "+--------+---+---+------+------+-----+------+\n", "|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n", "| Andrea| M| 37| 1.79| 68.0|21.22| false|\n", "| Marco| M| 33| 1.72| 88.5|29.91| true|\n", "+--------+---+---+------+------+-----+------+\n", "\n" ] } ], "source": [ "df_male = df.filter(df[\"sex\"] == 'M')\n", "df_male.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "6bNjhJonmUPn" }, "source": [ "### Aggregazione" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 1043, "status": "ok", "timestamp": 1592469846071, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "lZfYl1XlmKKM", "outputId": "f83d800d-9162-4c3b-aa70-2b5fd50f1d1d" }, "outputs": [ { "data": { "text/plain": [ "pyspark.sql.group.GroupedData" ] }, "execution_count": 33, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "df_group = df.groupBy('sex')\n", "type(df_group)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "HwyprPN7m6vr" }, "source": [ "Il risultato sarà un'oggetto GroupedData, sulla quale possiamo eseguire diverse operazioni aritmetiche e statistiche, come conteggio" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 2937, "status": "ok", "timestamp": 1592469999921, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "LdMSnbTimKIt", "outputId": "b26b70e3-fc40-427c-b0d3-e38f8cef0249" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+-----+\n", "|sex|count|\n", "+---+-----+\n", "| F| 2|\n", "| M| 3|\n", "+---+-----+\n", "\n" ] } ], "source": [ "df_group.count().show()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 2714, "status": "ok", "timestamp": 1592470012222, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "Pu1jAq-RmKF5", "outputId": "5ea0719a-5d6e-4aae-ca6e-67bae607e173" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+--------+-----------+-----------------+-----------------+\n", "|sex|avg(age)|avg(height)| avg(weight)| avg(bmi)|\n", "+---+--------+-----------+-----------------+-----------------+\n", "| F| 31.5| 1.6| 52.25| 20.415|\n", "| M| 31.0| 1.75|75.66666666666667|24.80666666666667|\n", "+---+--------+-----------+-----------------+-----------------+\n", "\n" ] } ], "source": [ "df_group.mean().show()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 1914, "status": "ok", "timestamp": 1592470023794, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "o5hmqAXKmKDo", "outputId": "c30b8c15-2861-4a68-fe41-253e8f74c9e0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+--------+-----------+-----------+--------+\n", "|sex|sum(age)|sum(height)|sum(weight)|sum(bmi)|\n", "+---+--------+-----------+-----------+--------+\n", "| F| 63| 3.2| 104.5| 40.83|\n", "| M| 93| 5.25| 227.0| 74.42|\n", "+---+--------+-----------+-----------+--------+\n", "\n" ] } ], "source": [ "df_group.sum().show()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 1889, "status": "ok", "timestamp": 1592470033356, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "YomUhEXVmKBe", "outputId": "783c3bc1-7e09-4721-c89a-ed3e933507ad" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+--------+-----------+-----------+--------+\n", "|sex|max(age)|max(height)|max(weight)|max(bmi)|\n", "+---+--------+-----------+-----------+--------+\n", "| F| 38| 1.65| 54.3| 20.89|\n", "| M| 37| 1.79| 88.5| 29.91|\n", "+---+--------+-----------+-----------+--------+\n", "\n" ] } ], "source": [ "df_group.max().show()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 1779, "status": "ok", "timestamp": 1592470040174, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "9TqA1lWBmJ_x", "outputId": "6f396303-fa68-4283-eeb5-af8c4b36912c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+--------+-----------+-----------+--------+\n", "|sex|min(age)|min(height)|min(weight)|min(bmi)|\n", "+---+--------+-----------+-----------+--------+\n", "| F| 25| 1.55| 50.2| 19.94|\n", "| M| 23| 1.72| 68.0| 21.22|\n", "+---+--------+-----------+-----------+--------+\n", "\n" ] } ], "source": [ "df_group.min().show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "xZ3NFs9inJ_D" }, "source": [ "Possiamo anche operare su singole colonne usando il metodo .agg(op) del Dataframe, che prende come parametro un dizionario contenente nome della colonna e operazione da eseguire." ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 119 }, "colab_type": "code", "executionInfo": { "elapsed": 714, "status": "ok", "timestamp": 1592470059644, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "iy_iECRRmJ9J", "outputId": "b668922f-7fa0-4180-8a7a-895b886b1512" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------+\n", "|sum(weight)|\n", "+-----------+\n", "| 331.5|\n", "+-----------+\n", "\n" ] } ], "source": [ "df.agg({'weight':'sum'}).show()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 2125, "status": "ok", "timestamp": 1592470068755, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "BGuGMFQcmJ6m", "outputId": "539f27eb-3fbf-4748-e622-24f9d6ce8c5e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+----------+-----------+-----------+\n", "|sex|count(sex)|sum(weight)|max(height)|\n", "+---+----------+-----------+-----------+\n", "| F| 2| 104.5| 1.65|\n", "| M| 3| 227.0| 1.79|\n", "+---+----------+-----------+-----------+\n", "\n" ] } ], "source": [ "df_group.agg({'weight':'sum', 'height':'max', 'sex':'count'}).show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "3ybMtddnnSOx" }, "source": [ "il nome delle nuove colonne viene assegnato automaticamente, in base alla funzione ed alla colonna che abbiamo utilizzato, possiamo modificare tali nomi usando il metodo .withColumnRenamed(old_name, new_name)." ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 1517, "status": "ok", "timestamp": 1592470130205, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "WbbWZO7LnOg4", "outputId": "29edd63d-23b8-4304-a6e9-05e38788cbb2" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+---------+----------+----------+\n", "|sex|count_sex|sum_weight|max_height|\n", "+---+---------+----------+----------+\n", "| F| 2| 104.5| 1.65|\n", "| M| 3| 227.0| 1.79|\n", "+---+---------+----------+----------+\n", "\n" ] } ], "source": [ "df_group.agg({'weight':'sum', 'height':'max', 'sex':'count'})\\\n", " .withColumnRenamed(\"count(sex)\",\"count_sex\")\\\n", " .withColumnRenamed(\"sum(weight)\",\"sum_weight\")\\\n", " .withColumnRenamed(\"max(height)\",\"max_height\").show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "mZRlsYAXnhqs" }, "source": [ "Piuttosto che un dizionario, possiamo anche utilizzare delle funzioni.\n" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 1725, "status": "ok", "timestamp": 1592470160798, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "DWNFjvZInOdw", "outputId": "9c8c1448-8131-4eb9-bdd9-0a20e12702ed" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+-----------+-----------+----------+\n", "|sex|sum(weight)|max(height)|count(sex)|\n", "+---+-----------+-----------+----------+\n", "| F| 104.5| 1.65| 2|\n", "| M| 227.0| 1.79| 3|\n", "+---+-----------+-----------+----------+\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import sum, max, count\n", "\n", "df_group.agg(sum(\"weight\"), max('height'), count('sex')).show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "wsfh2dqknl04" }, "source": [ "In questo caso per settare arbitrariamente i nomi delle colonne possiamo creare un alias" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 136 }, "colab_type": "code", "executionInfo": { "elapsed": 1564, "status": "ok", "timestamp": 1592470176862, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "VCCv1MO5nOa2", "outputId": "25d20591-9f58-498f-d990-71da9189be51" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---+----------+----------+---------+\n", "|sex|sum_weight|max_height|count_sex|\n", "+---+----------+----------+---------+\n", "| F| 104.5| 1.65| 2|\n", "| M| 227.0| 1.79| 3|\n", "+---+----------+----------+---------+\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import sum, max, count\n", "\n", "df_group.agg(sum(\"weight\").alias(\"sum_weight\"), max('height').alias(\"max_height\"), count('sex').alias(\"count_sex\")).show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "fKVPMLdFnsG5" }, "source": [ "### Ordinamento" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "NxumwMTHnuxW" }, "source": [ "Per ordinare un Dataframe possiamo utilizzare il metodo .orderBy(col), ad esempio ordiniamo in base al peso." ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 765, "status": "ok", "timestamp": 1592470209332, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "DLXFW6-DnOYW", "outputId": "409868f5-83e7-4c8e-969f-d867f2374740" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---+---+------+------+-----+------+\n", "| name|sex|age|height|weight| bmi|is_fat|\n", "+--------+---+---+------+------+-----+------+\n", "|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n", "| Monica| F| 25| 1.65| 54.3|19.94| false|\n", "| Andrea| M| 37| 1.79| 68.0|21.22| false|\n", "|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n", "| Marco| M| 33| 1.72| 88.5|29.91| true|\n", "+--------+---+---+------+------+-----+------+\n", "\n" ] } ], "source": [ "df.orderBy(\"weight\").show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "dCenAI6iny3f" }, "source": [ "Di default l'ordinamento viene eseguito in maniera ascendente (dal valore minore al valore maggiore), per eseguirlo in maniera discendente ci basta impostare il parametro ascending a False." ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 187 }, "colab_type": "code", "executionInfo": { "elapsed": 521, "status": "ok", "timestamp": 1592470226589, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "Cf6WUd6rnOWM", "outputId": "f6dbc6b6-71d8-42ab-d72e-dc9a54a056d9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+---+---+------+------+-----+------+\n", "| name|sex|age|height|weight| bmi|is_fat|\n", "+--------+---+---+------+------+-----+------+\n", "| Marco| M| 33| 1.72| 88.5|29.91| true|\n", "|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n", "| Andrea| M| 37| 1.79| 68.0|21.22| false|\n", "| Monica| F| 25| 1.65| 54.3|19.94| false|\n", "|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n", "+--------+---+---+------+------+-----+------+\n", "\n" ] } ], "source": [ "df.orderBy(\"weight\", ascending=False).show()" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "pyspark_dataframe.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 1 }