{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fr_txRZ1i1_4"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "fFqeiV3qxso7"
},
"outputs": [],
"source": [
"################ template to run PySpark on Colab #######################"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 35868,
"status": "ok",
"timestamp": 1592469024441,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "m6uGlMRavWlb"
},
"outputs": [],
"source": [
"!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n",
"!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz\n",
"!tar xf spark-2.4.5-bin-hadoop2.7.tgz\n",
"!pip install -q findspark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 35291,
"status": "ok",
"timestamp": 1592469024442,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "f75mt5iLvaI3"
},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
"os.environ[\"SPARK_HOME\"] = \"/content/spark-2.4.5-bin-hadoop2.7\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 39386,
"status": "ok",
"timestamp": 1592469029149,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "X1NyKr7qvaGP"
},
"outputs": [],
"source": [
"import findspark\n",
"findspark.init()\n",
"\n",
"from pyspark.sql import SparkSession\n",
"spark = SparkSession.builder.master(\"local[*]\").getOrCreate()\n",
"spark1 = SparkSession.builder.appName('basic').getOrCreate()\n",
"#Test must give no error"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 38615,
"status": "ok",
"timestamp": 1592469029151,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "MXYgwjn4vaAH"
},
"outputs": [],
"source": [
"import pyspark"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 38056,
"status": "ok",
"timestamp": 1592469029153,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "08V9HAqMxm9D"
},
"outputs": [],
"source": [
"################ end template PySpark on Colab ##########################"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 37584,
"status": "ok",
"timestamp": 1592469029154,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "rkhXfoJzCOtN"
},
"outputs": [],
"source": [
"from pyspark import SparkConf, SparkContext\n",
"conf = SparkConf().setAppName(\"basic\").setMaster(\"local\")\n",
"#sc = SparkContext(conf=conf) ## for jupyter and Databricks\n",
"sc = SparkContext.getOrCreate() ## for Colab"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 521,
"status": "ok",
"timestamp": 1592469074827,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "7BYL1qr3jLh-"
},
"outputs": [],
"source": [
"from pyspark.sql.types import *"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "b7hyPjIpjgrq"
},
"source": [
"# Dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load dataset"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 648
},
"colab_type": "code",
"executionInfo": {
"elapsed": 8313,
"status": "ok",
"timestamp": 1592469116466,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "pOIesB3ijLfW",
"outputId": "b60a5d4c-198b-4ef0-839a-3763450c0ab3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2020-06-18 08:31:49-- https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/shirts.csv\n",
"Resolving frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)... 52.95.150.54\n",
"Connecting to frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)|52.95.150.54|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 1741 (1.7K) [application/vnd.ms-excel]\n",
"Saving to: ‘shirts.csv’\n",
"\n",
"\r",
"shirts.csv 0%[ ] 0 --.-KB/s \r",
"shirts.csv 100%[===================>] 1.70K --.-KB/s in 0s \n",
"\n",
"2020-06-18 08:31:49 (79.7 MB/s) - ‘shirts.csv’ saved [1741/1741]\n",
"\n",
"+---+------+------+------+\n",
"|_c0|taglia|colore|prezzo|\n",
"+---+------+------+------+\n",
"| 0| S|bianco| 4.99|\n",
"| 1| M|bianco| 19.99|\n",
"| 2| XL|bianco| 12.49|\n",
"| 3| XL|bianco| 14.99|\n",
"| 4| S|bianco| 14.99|\n",
"| 5| S| verde| 7.99|\n",
"| 6| M| verde| 4.99|\n",
"| 7| L| verde| 12.49|\n",
"| 8| XL|bianco| 12.49|\n",
"| 9| M| verde| 19.99|\n",
"| 10| L|bianco| 14.99|\n",
"| 11| XL|bianco| 19.99|\n",
"| 12| M|bianco| 4.99|\n",
"| 13| L|bianco| 7.99|\n",
"| 14| M|bianco| 14.99|\n",
"| 15| XL| rosso| 9.99|\n",
"| 16| S| rosso| 12.49|\n",
"| 17| L|bianco| 7.99|\n",
"| 18| XL|bianco| 4.99|\n",
"| 19| M| verde| 14.99|\n",
"+---+------+------+------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"!wget https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/shirts.csv\n",
"df0 = spark.read.load(\"shirts.csv\", format=\"csv\", sep=\",\", inferSchema=\"true\", header=\"true\")\n",
"df0.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ujd-bI4Ijmuc"
},
"source": [
"## Creazione dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "N3ysjRE7jqYP"
},
"source": [
"Possiamo creare un nuovo Dataframe usando il metodo .createDataFrame(data, names) dell'oggetto SparkSession, questo metodo ha bisogno di due parametri:\n",
"\n",
" Una lista di tuple, in cui ogni tupla corrisponde ad una riga del Dataframe.\n",
" Una lista con i nomi per le colonne\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 488,
"status": "ok",
"timestamp": 1592469193232,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "D2QjoH8cjLdX"
},
"outputs": [],
"source": [
"data = [(\"Gianluca\", \"M\", 23, 174, 70.5),\n",
" (\"Andrea\", \"M\", 37, 179, 68.),\n",
" (\"Marco\", \"M\", 33, 172, 88.5),\n",
" (\"Annalisa\", \"F\", 38, 155, 50.2),\n",
" (\"Monica\", \"F\", 25, 165, 54.3)]\n",
"\n",
"df = spark.createDataFrame(data, [\"name\", \"gender\", \"age\", \"height\",\"weight\"])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1606,
"status": "ok",
"timestamp": 1592469204952,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "oJdXVTEGjLa7",
"outputId": "63a8e297-aa5e-4d52-b372-1ef9d34e8fd0"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+------+---+------+------+\n",
"| name|gender|age|height|weight|\n",
"+--------+------+---+------+------+\n",
"|Gianluca| M| 23| 174| 70.5|\n",
"| Andrea| M| 37| 179| 68.0|\n",
"| Marco| M| 33| 172| 88.5|\n",
"|Annalisa| F| 38| 155| 50.2|\n",
"| Monica| F| 25| 165| 54.3|\n",
"+--------+------+---+------+------+\n",
"\n"
]
}
],
"source": [
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 170
},
"colab_type": "code",
"executionInfo": {
"elapsed": 698,
"status": "ok",
"timestamp": 1592469213483,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "dMuHKISSjLWn",
"outputId": "106e3ef3-0b99-43d0-d8fb-98f39fdfeabd"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+------+---+------+------+\n",
"| name|gender|age|height|weight|\n",
"+--------+------+---+------+------+\n",
"|Gianluca| M| 23| 174| 70.5|\n",
"| Andrea| M| 37| 179| 68.0|\n",
"| Marco| M| 33| 172| 88.5|\n",
"+--------+------+---+------+------+\n",
"only showing top 3 rows\n",
"\n"
]
}
],
"source": [
"df.show(3)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 478,
"status": "ok",
"timestamp": 1592469225731,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "SyY-kaRKjLUs",
"outputId": "448d0657-92a7-4ab0-9f15-fb0882b7eed4"
},
"outputs": [
{
"data": {
"text/plain": [
"['name', 'gender', 'age', 'height', 'weight']"
]
},
"execution_count": 13,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "VixPqRkHkC2t"
},
"source": [
"Per stampare lo schema del Dataframe, cioè le informazioni legate ad ogni attributo (nome, tipo, se può essere null), possiamo usare il metodo .printSchema()."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 514,
"status": "ok",
"timestamp": 1592469244654,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "imckHZ3RjLR8",
"outputId": "bd6587db-8f01-4087-8da3-f2a77e5b23ba"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- name: string (nullable = true)\n",
" |-- gender: string (nullable = true)\n",
" |-- age: long (nullable = true)\n",
" |-- height: long (nullable = true)\n",
" |-- weight: double (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "s1v_GGtSkHjW"
},
"source": [
"Possiamo visualizzare una serie di informazioni statistiche (count, valore medio, deviazione standard, valore minimo e massimo) usando il metodo .describe(),"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 2104,
"status": "ok",
"timestamp": 1592469273206,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "PP39tJi7jLQB",
"outputId": "3b2d9cb5-f9e4-47a2-b1a7-fcf8d06d8a1c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+------+------+-----------------+-----------------+-----------------+\n",
"|summary| name|gender| age| height| weight|\n",
"+-------+------+------+-----------------+-----------------+-----------------+\n",
"| count| 5| 5| 5| 5| 5|\n",
"| mean| null| null| 31.2| 169.0| 66.3|\n",
"| stddev| null| null|6.870225614927067|9.300537618869138|15.13753612712452|\n",
"| min|Andrea| F| 23| 155| 50.2|\n",
"| max|Monica| M| 38| 179| 88.5|\n",
"+-------+------+------+-----------------+-----------------+-----------------+\n",
"\n"
]
}
],
"source": [
"df.describe().show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "0L7HTHWwkNPb"
},
"source": [
"## Modificare lo schema"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "93s1lvaOkUIW"
},
"source": [
"lo schema del Dataframe è stato estratto direttamente dai dati, ma se volessimo definirlo noi ?\n",
"\n",
" age: da long a intero.\n",
" height: da long a intero.\n",
" weight: da double a float.\n",
"\n",
"Possiamo farlo creando uno schema per poi passarlo al metodo .createDataFrame(data, schema)."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 912,
"status": "ok",
"timestamp": 1592469346823,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "I_oS6Kb_jLNh"
},
"outputs": [],
"source": [
"data_schema = [StructField('name', StringType(), True),\n",
" StructField('gender', StringType(), True),\n",
" StructField('age', IntegerType(), True),\n",
" StructField('height', IntegerType(), True),\n",
" StructField('weight', FloatType(), True)]\n",
" \n",
"schema = StructType(fields=data_schema)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hf8GySqokjBd"
},
"source": [
"Adesso creiamo il Dataframe, passando i dati e lo schema all'interno del parametro schema."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 832,
"status": "ok",
"timestamp": 1592469358962,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "1q_XLDFUjKzp",
"outputId": "374784d3-e140-4900-debf-99e599519456"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+------+---+------+------+\n",
"| name|gender|age|height|weight|\n",
"+--------+------+---+------+------+\n",
"|Gianluca| M| 23| 174| 70.5|\n",
"| Andrea| M| 37| 179| 68.0|\n",
"| Marco| M| 33| 172| 88.5|\n",
"|Annalisa| F| 38| 155| 50.2|\n",
"| Monica| F| 25| 165| 54.3|\n",
"+--------+------+---+------+------+\n",
"\n"
]
}
],
"source": [
"df = spark.createDataFrame(data, schema=schema)\n",
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 680,
"status": "ok",
"timestamp": 1592469386326,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "xg2DX1_Ui2AR",
"outputId": "26fc8b55-4297-4979-f721-b03ed041c2ad"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- name: string (nullable = true)\n",
" |-- gender: string (nullable = true)\n",
" |-- age: integer (nullable = true)\n",
" |-- height: integer (nullable = true)\n",
" |-- weight: float (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "CIHc38arkpM4"
},
"source": [
"## Righe Colonne"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"colab_type": "code",
"executionInfo": {
"elapsed": 880,
"status": "ok",
"timestamp": 1592469409572,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "vHmn7T5vi2AU",
"outputId": "773534b9-8162-42cc-9a6b-2cc3a1110d00"
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(name='Gianluca', gender='M', age=23, height=174, weight=70.5),\n",
" Row(name='Andrea', gender='M', age=37, height=179, weight=68.0),\n",
" Row(name='Marco', gender='M', age=33, height=172, weight=88.5),\n",
" Row(name='Annalisa', gender='F', age=38, height=155, weight=50.20000076293945),\n",
" Row(name='Monica', gender='F', age=25, height=165, weight=54.29999923706055)]"
]
},
"execution_count": 19,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZrN9IQf-kvoy"
},
"source": [
"Per selezionare solo una colonna del Dataframe possiamo usare l'indice o il nome"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 513,
"status": "ok",
"timestamp": 1592469428469,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "3xmAv249kstp",
"outputId": "3520cb27-c19d-47c4-c6cf-5b84ad709de1"
},
"outputs": [
{
"data": {
"text/plain": [
"Column"
]
},
"execution_count": 20,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"df[0]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 633,
"status": "ok",
"timestamp": 1592469439838,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "9veTgBO1kyVT",
"outputId": "d97275ac-157e-495f-fe43-baee3d24275e"
},
"outputs": [
{
"data": {
"text/plain": [
"Column"
]
},
"execution_count": 21,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"df[\"name\"]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 812,
"status": "ok",
"timestamp": 1592469470952,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "vIndgQIjkySq",
"outputId": "87c4e804-af7c-45a4-a71c-7a89c6e80d69"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+\n",
"| name|\n",
"+--------+\n",
"|Gianluca|\n",
"| Andrea|\n",
"| Marco|\n",
"|Annalisa|\n",
"| Monica|\n",
"+--------+\n",
"\n"
]
}
],
"source": [
"dfName = df.select(\"name\")\n",
"dfName.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wM5eRDdBk-cT"
},
"source": [
"Possiamo usare lo stesso metodo per selezionare più colonne, passando una lista di nomi come parametro."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 754,
"status": "ok",
"timestamp": 1592469491680,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "UQGaar-bkyQu",
"outputId": "cfd27979-6b8a-4d9a-f17b-6c90c65db52f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---+\n",
"| name|age|\n",
"+--------+---+\n",
"|Gianluca| 23|\n",
"| Andrea| 37|\n",
"| Marco| 33|\n",
"|Annalisa| 38|\n",
"| Monica| 25|\n",
"+--------+---+\n",
"\n"
]
}
],
"source": [
"df.select([\"name\",\"age\"]).show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "npibj1dBlCy8"
},
"source": [
"## Creare e modificare colonne"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "x16t8RImlF2D"
},
"source": [
"Possiamo modificare una determinata colonna utilizzando il metodo .withColumn(name, column). alla quale dovremo passare il nome della riga che dovremmo modificare e un oggetto colonna"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 712,
"status": "ok",
"timestamp": 1592469524211,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "o3R_hPAHkyPZ",
"outputId": "a41a6bb0-311f-4ee2-fad9-bf0f99a6b7e2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+------+---+------+------+\n",
"| name|gender|age|height|weight|\n",
"+--------+------+---+------+------+\n",
"|Gianluca| M| 23| 1.74| 70.5|\n",
"| Andrea| M| 37| 1.79| 68.0|\n",
"| Marco| M| 33| 1.72| 88.5|\n",
"|Annalisa| F| 38| 1.55| 50.2|\n",
"| Monica| F| 25| 1.65| 54.3|\n",
"+--------+------+---+------+------+\n",
"\n"
]
}
],
"source": [
"df = df.withColumn(\"height\", df[\"height\"]/100)\n",
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 680,
"status": "ok",
"timestamp": 1592469541057,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "fjwgQ0wTkyM_",
"outputId": "e4adc5f2-e370-4923-bf30-2f714293c03f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- name: string (nullable = true)\n",
" |-- gender: string (nullable = true)\n",
" |-- age: integer (nullable = true)\n",
" |-- height: double (nullable = true)\n",
" |-- weight: float (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "nZYD5aMblQmF"
},
"source": [
"Lo schema è stato modificato automaticamente, dato che abbiamo convertito l'altezza da numeri interi a numeri con la virgola. "
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "In501RU7lWD2"
},
"source": [
"l'indice di massa corporea (bmi) di ogni persona:\n",
"$$ bmi = (\\frac{weight}{height})^2 $$\n"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 701,
"status": "ok",
"timestamp": 1592469596192,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "cEjsnEsjkyLW",
"outputId": "674ed155-361f-467b-f45d-f8d788599317"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+------+---+------+------+------------------+\n",
"| name|gender|age|height|weight| bmi|\n",
"+--------+------+---+------+------+------------------+\n",
"|Gianluca| M| 23| 1.74| 70.5|23.285770907649624|\n",
"| Andrea| M| 37| 1.79| 68.0| 21.22280827689523|\n",
"| Marco| M| 33| 1.72| 88.5| 29.91481882098432|\n",
"|Annalisa| F| 38| 1.55| 50.2| 20.89490146220164|\n",
"| Monica| F| 25| 1.65| 54.3|19.944903301032344|\n",
"+--------+------+---+------+------+------------------+\n",
"\n"
]
}
],
"source": [
"bmi = df[\"weight\"]/(df[\"height\"]**2)\n",
"df = df.withColumn(\"bmi\", bmi)\n",
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1012,
"status": "ok",
"timestamp": 1592469617720,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "P9LBDKB_kyI7",
"outputId": "f378ff01-6aff-4935-f29a-9242072a39ea"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+------+---+------+------+-----+\n",
"| name|gender|age|height|weight| bmi|\n",
"+--------+------+---+------+------+-----+\n",
"|Gianluca| M| 23| 1.74| 70.5|23.29|\n",
"| Andrea| M| 37| 1.79| 68.0|21.22|\n",
"| Marco| M| 33| 1.72| 88.5|29.91|\n",
"|Annalisa| F| 38| 1.55| 50.2|20.89|\n",
"| Monica| F| 25| 1.65| 54.3|19.94|\n",
"+--------+------+---+------+------+-----+\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import round\n",
"\n",
"df = df.withColumn(\"bmi\", round(df[\"bmi\"], 2))\n",
"df.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 810,
"status": "ok",
"timestamp": 1592469643096,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "HI9PP7uNkyIA",
"outputId": "48883ac1-0a4a-4319-b263-1f1e13e4625f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+------+---+------+------+-----+------+\n",
"| name|gender|age|height|weight| bmi|is_fat|\n",
"+--------+------+---+------+------+-----+------+\n",
"|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n",
"| Andrea| M| 37| 1.79| 68.0|21.22| false|\n",
"| Marco| M| 33| 1.72| 88.5|29.91| true|\n",
"|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n",
"| Monica| F| 25| 1.65| 54.3|19.94| false|\n",
"+--------+------+---+------+------+-----+------+\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import col, when\n",
"\n",
"df = df.withColumn(\"is_fat\", when(col(\"bmi\")>25, True).otherwise(False))\n",
"df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "PKyseaydmHNJ"
},
"source": [
"Per finire rinominiamo la colonna gender in sex, possiamo farlo tramite il metodo .withColumnRenamed(old_name, new_name)."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 881,
"status": "ok",
"timestamp": 1592469788286,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "kdJr0ytKkyFO",
"outputId": "924e2a0e-a751-4e56-cfef-50efea664706"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---+---+------+------+-----+------+\n",
"| name|sex|age|height|weight| bmi|is_fat|\n",
"+--------+---+---+------+------+-----+------+\n",
"|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n",
"| Andrea| M| 37| 1.79| 68.0|21.22| false|\n",
"| Marco| M| 33| 1.72| 88.5|29.91| true|\n",
"|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n",
"| Monica| F| 25| 1.65| 54.3|19.94| false|\n",
"+--------+---+---+------+------+-----+------+\n",
"\n"
]
}
],
"source": [
"df = df.withColumnRenamed(\"gender\",\"sex\")\n",
"df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "rindol0YmLur"
},
"source": [
"### Filtri"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 153
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1274,
"status": "ok",
"timestamp": 1592469810758,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "7oWU2EQckxab",
"outputId": "60380188-dd0b-44d8-9c97-7e1145585ea8"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---+---+------+------+-----+------+\n",
"| name|sex|age|height|weight| bmi|is_fat|\n",
"+--------+---+---+------+------+-----+------+\n",
"|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n",
"| Andrea| M| 37| 1.79| 68.0|21.22| false|\n",
"| Marco| M| 33| 1.72| 88.5|29.91| true|\n",
"+--------+---+---+------+------+-----+------+\n",
"\n"
]
}
],
"source": [
"df_male = df.filter(\"sex == 'M'\")\n",
"df_male.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "tv0-rXVfmQz8"
},
"source": [
"oppure..."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 153
},
"colab_type": "code",
"executionInfo": {
"elapsed": 849,
"status": "ok",
"timestamp": 1592469827022,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "c2zTdLYRmKM2",
"outputId": "dca1d869-58ae-4cae-c9d2-6b862aa2d57e"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---+---+------+------+-----+------+\n",
"| name|sex|age|height|weight| bmi|is_fat|\n",
"+--------+---+---+------+------+-----+------+\n",
"|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n",
"| Andrea| M| 37| 1.79| 68.0|21.22| false|\n",
"| Marco| M| 33| 1.72| 88.5|29.91| true|\n",
"+--------+---+---+------+------+-----+------+\n",
"\n"
]
}
],
"source": [
"df_male = df.filter(df[\"sex\"] == 'M')\n",
"df_male.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "6bNjhJonmUPn"
},
"source": [
"### Aggregazione"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1043,
"status": "ok",
"timestamp": 1592469846071,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "lZfYl1XlmKKM",
"outputId": "f83d800d-9162-4c3b-aa70-2b5fd50f1d1d"
},
"outputs": [
{
"data": {
"text/plain": [
"pyspark.sql.group.GroupedData"
]
},
"execution_count": 33,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"df_group = df.groupBy('sex')\n",
"type(df_group)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "HwyprPN7m6vr"
},
"source": [
"Il risultato sarà un'oggetto GroupedData, sulla quale possiamo eseguire diverse operazioni aritmetiche e statistiche, come conteggio"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 2937,
"status": "ok",
"timestamp": 1592469999921,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "LdMSnbTimKIt",
"outputId": "b26b70e3-fc40-427c-b0d3-e38f8cef0249"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+-----+\n",
"|sex|count|\n",
"+---+-----+\n",
"| F| 2|\n",
"| M| 3|\n",
"+---+-----+\n",
"\n"
]
}
],
"source": [
"df_group.count().show()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 2714,
"status": "ok",
"timestamp": 1592470012222,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "Pu1jAq-RmKF5",
"outputId": "5ea0719a-5d6e-4aae-ca6e-67bae607e173"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+--------+-----------+-----------------+-----------------+\n",
"|sex|avg(age)|avg(height)| avg(weight)| avg(bmi)|\n",
"+---+--------+-----------+-----------------+-----------------+\n",
"| F| 31.5| 1.6| 52.25| 20.415|\n",
"| M| 31.0| 1.75|75.66666666666667|24.80666666666667|\n",
"+---+--------+-----------+-----------------+-----------------+\n",
"\n"
]
}
],
"source": [
"df_group.mean().show()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1914,
"status": "ok",
"timestamp": 1592470023794,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "o5hmqAXKmKDo",
"outputId": "c30b8c15-2861-4a68-fe41-253e8f74c9e0"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+--------+-----------+-----------+--------+\n",
"|sex|sum(age)|sum(height)|sum(weight)|sum(bmi)|\n",
"+---+--------+-----------+-----------+--------+\n",
"| F| 63| 3.2| 104.5| 40.83|\n",
"| M| 93| 5.25| 227.0| 74.42|\n",
"+---+--------+-----------+-----------+--------+\n",
"\n"
]
}
],
"source": [
"df_group.sum().show()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1889,
"status": "ok",
"timestamp": 1592470033356,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "YomUhEXVmKBe",
"outputId": "783c3bc1-7e09-4721-c89a-ed3e933507ad"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+--------+-----------+-----------+--------+\n",
"|sex|max(age)|max(height)|max(weight)|max(bmi)|\n",
"+---+--------+-----------+-----------+--------+\n",
"| F| 38| 1.65| 54.3| 20.89|\n",
"| M| 37| 1.79| 88.5| 29.91|\n",
"+---+--------+-----------+-----------+--------+\n",
"\n"
]
}
],
"source": [
"df_group.max().show()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1779,
"status": "ok",
"timestamp": 1592470040174,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "9TqA1lWBmJ_x",
"outputId": "6f396303-fa68-4283-eeb5-af8c4b36912c"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+--------+-----------+-----------+--------+\n",
"|sex|min(age)|min(height)|min(weight)|min(bmi)|\n",
"+---+--------+-----------+-----------+--------+\n",
"| F| 25| 1.55| 50.2| 19.94|\n",
"| M| 23| 1.72| 68.0| 21.22|\n",
"+---+--------+-----------+-----------+--------+\n",
"\n"
]
}
],
"source": [
"df_group.min().show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xZ3NFs9inJ_D"
},
"source": [
"Possiamo anche operare su singole colonne usando il metodo .agg(op) del Dataframe, che prende come parametro un dizionario contenente nome della colonna e operazione da eseguire."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 119
},
"colab_type": "code",
"executionInfo": {
"elapsed": 714,
"status": "ok",
"timestamp": 1592470059644,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "iy_iECRRmJ9J",
"outputId": "b668922f-7fa0-4180-8a7a-895b886b1512"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+\n",
"|sum(weight)|\n",
"+-----------+\n",
"| 331.5|\n",
"+-----------+\n",
"\n"
]
}
],
"source": [
"df.agg({'weight':'sum'}).show()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 2125,
"status": "ok",
"timestamp": 1592470068755,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "BGuGMFQcmJ6m",
"outputId": "539f27eb-3fbf-4748-e622-24f9d6ce8c5e"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+----------+-----------+-----------+\n",
"|sex|count(sex)|sum(weight)|max(height)|\n",
"+---+----------+-----------+-----------+\n",
"| F| 2| 104.5| 1.65|\n",
"| M| 3| 227.0| 1.79|\n",
"+---+----------+-----------+-----------+\n",
"\n"
]
}
],
"source": [
"df_group.agg({'weight':'sum', 'height':'max', 'sex':'count'}).show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "3ybMtddnnSOx"
},
"source": [
"il nome delle nuove colonne viene assegnato automaticamente, in base alla funzione ed alla colonna che abbiamo utilizzato, possiamo modificare tali nomi usando il metodo .withColumnRenamed(old_name, new_name)."
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1517,
"status": "ok",
"timestamp": 1592470130205,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "WbbWZO7LnOg4",
"outputId": "29edd63d-23b8-4304-a6e9-05e38788cbb2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+---------+----------+----------+\n",
"|sex|count_sex|sum_weight|max_height|\n",
"+---+---------+----------+----------+\n",
"| F| 2| 104.5| 1.65|\n",
"| M| 3| 227.0| 1.79|\n",
"+---+---------+----------+----------+\n",
"\n"
]
}
],
"source": [
"df_group.agg({'weight':'sum', 'height':'max', 'sex':'count'})\\\n",
" .withColumnRenamed(\"count(sex)\",\"count_sex\")\\\n",
" .withColumnRenamed(\"sum(weight)\",\"sum_weight\")\\\n",
" .withColumnRenamed(\"max(height)\",\"max_height\").show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "mZRlsYAXnhqs"
},
"source": [
"Piuttosto che un dizionario, possiamo anche utilizzare delle funzioni.\n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1725,
"status": "ok",
"timestamp": 1592470160798,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "DWNFjvZInOdw",
"outputId": "9c8c1448-8131-4eb9-bdd9-0a20e12702ed"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+-----------+-----------+----------+\n",
"|sex|sum(weight)|max(height)|count(sex)|\n",
"+---+-----------+-----------+----------+\n",
"| F| 104.5| 1.65| 2|\n",
"| M| 227.0| 1.79| 3|\n",
"+---+-----------+-----------+----------+\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import sum, max, count\n",
"\n",
"df_group.agg(sum(\"weight\"), max('height'), count('sex')).show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wsfh2dqknl04"
},
"source": [
"In questo caso per settare arbitrariamente i nomi delle colonne possiamo creare un alias"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"executionInfo": {
"elapsed": 1564,
"status": "ok",
"timestamp": 1592470176862,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "VCCv1MO5nOa2",
"outputId": "25d20591-9f58-498f-d990-71da9189be51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---+----------+----------+---------+\n",
"|sex|sum_weight|max_height|count_sex|\n",
"+---+----------+----------+---------+\n",
"| F| 104.5| 1.65| 2|\n",
"| M| 227.0| 1.79| 3|\n",
"+---+----------+----------+---------+\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import sum, max, count\n",
"\n",
"df_group.agg(sum(\"weight\").alias(\"sum_weight\"), max('height').alias(\"max_height\"), count('sex').alias(\"count_sex\")).show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "fKVPMLdFnsG5"
},
"source": [
"### Ordinamento"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "NxumwMTHnuxW"
},
"source": [
"Per ordinare un Dataframe possiamo utilizzare il metodo .orderBy(col), ad esempio ordiniamo in base al peso."
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 765,
"status": "ok",
"timestamp": 1592470209332,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "DLXFW6-DnOYW",
"outputId": "409868f5-83e7-4c8e-969f-d867f2374740"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---+---+------+------+-----+------+\n",
"| name|sex|age|height|weight| bmi|is_fat|\n",
"+--------+---+---+------+------+-----+------+\n",
"|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n",
"| Monica| F| 25| 1.65| 54.3|19.94| false|\n",
"| Andrea| M| 37| 1.79| 68.0|21.22| false|\n",
"|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n",
"| Marco| M| 33| 1.72| 88.5|29.91| true|\n",
"+--------+---+---+------+------+-----+------+\n",
"\n"
]
}
],
"source": [
"df.orderBy(\"weight\").show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "dCenAI6iny3f"
},
"source": [
"Di default l'ordinamento viene eseguito in maniera ascendente (dal valore minore al valore maggiore), per eseguirlo in maniera discendente ci basta impostare il parametro ascending a False."
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 187
},
"colab_type": "code",
"executionInfo": {
"elapsed": 521,
"status": "ok",
"timestamp": 1592470226589,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "Cf6WUd6rnOWM",
"outputId": "f6dbc6b6-71d8-42ab-d72e-dc9a54a056d9"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------+---+---+------+------+-----+------+\n",
"| name|sex|age|height|weight| bmi|is_fat|\n",
"+--------+---+---+------+------+-----+------+\n",
"| Marco| M| 33| 1.72| 88.5|29.91| true|\n",
"|Gianluca| M| 23| 1.74| 70.5|23.29| false|\n",
"| Andrea| M| 37| 1.79| 68.0|21.22| false|\n",
"| Monica| F| 25| 1.65| 54.3|19.94| false|\n",
"|Annalisa| F| 38| 1.55| 50.2|20.89| false|\n",
"+--------+---+---+------+------+-----+------+\n",
"\n"
]
}
],
"source": [
"df.orderBy(\"weight\", ascending=False).show()"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "pyspark_dataframe.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 1
}