{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "zqow_NYRy524"
},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ZFTjaQKFy-Mp"
},
"source": [
"## Pyspark"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 782,
"status": "ok",
"timestamp": 1592474394951,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "jatRvzOYy0eI"
},
"outputs": [],
"source": [
"################ template to run PySpark on Colab #######################"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 25026,
"status": "ok",
"timestamp": 1592474419209,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "ejBgn45wy0bv"
},
"outputs": [],
"source": [
"!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n",
"!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz\n",
"!tar xf spark-2.4.5-bin-hadoop2.7.tgz\n",
"!pip install -q findspark"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 25015,
"status": "ok",
"timestamp": 1592474419211,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "-Dvb5iHFy0Yo"
},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n",
"os.environ[\"SPARK_HOME\"] = \"/content/spark-2.4.5-bin-hadoop2.7\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 30612,
"status": "ok",
"timestamp": 1592474424825,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "bP9gE_cLy0V0"
},
"outputs": [],
"source": [
"import findspark\n",
"findspark.init()\n",
"\n",
"from pyspark.sql import SparkSession\n",
"spark = SparkSession.builder.master(\"local[*]\").getOrCreate()\n",
"spark1 = SparkSession.builder.appName('basic').getOrCreate()\n",
"#Test must give no error"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 30601,
"status": "ok",
"timestamp": 1592474424826,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "3TYtP_lWy0Tf"
},
"outputs": [],
"source": [
"import pyspark"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 30588,
"status": "ok",
"timestamp": 1592474424828,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "BgBi67Puy0Rk"
},
"outputs": [],
"source": [
"from pyspark import SparkConf, SparkContext\n",
"conf = SparkConf().setAppName(\"basic\").setMaster(\"local\")\n",
"#sc = SparkContext(conf=conf) ## for jupyter and Databricks\n",
"sc = SparkContext.getOrCreate() ## for Colab"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 30572,
"status": "ok",
"timestamp": 1592474424828,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "exXSKN8Ly0O2"
},
"outputs": [],
"source": [
"from pyspark.sql.types import *"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 32369,
"status": "ok",
"timestamp": 1592474426639,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "1ZSwMafky0MN",
"outputId": "c24891ec-f738-40c9-db19-e345ff689b32"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2020-06-18 10:00:25-- https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/BostonHousing.csv\n",
"Resolving frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)... 52.95.149.38\n",
"Connecting to frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)|52.95.149.38|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 35735 (35K) [application/vnd.ms-excel]\n",
"Saving to: ‘BostonHousing.csv.1’\n",
"\n",
"BostonHousing.csv.1 100%[===================>] 34.90K --.-KB/s in 0.1s \n",
"\n",
"2020-06-18 10:00:26 (273 KB/s) - ‘BostonHousing.csv.1’ saved [35735/35735]\n",
"\n"
]
}
],
"source": [
"!wget https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/BostonHousing.csv"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 40749,
"status": "ok",
"timestamp": 1592474435039,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "aAW5gKCpzwye",
"outputId": "287dbfe8-2652-4c58-d88b-000cff04b761"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| b|lstat|medv|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+\n",
"|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98|24.0|\n",
"|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14|21.6|\n",
"|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|34.7|\n",
"|0.03237| 0.0| 2.18| 0|0.458|6.998|45.8|6.0622| 3|222| 18.7|394.63| 2.94|33.4|\n",
"|0.06905| 0.0| 2.18| 0|0.458|7.147|54.2|6.0622| 3|222| 18.7| 396.9| 5.33|36.2|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"housing_df = spark.read.csv(\"BostonHousing.csv\", inferSchema=True, header=True)\n",
"housing_df.show(5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "3ab_gNTvyr5-"
},
"source": [
"## Boston Dataset\n",
"\n",
"**CRIM** Tasso di criminalità per capita
\n",
"**ZN** Percentuale di terreni residenziali suddivisi in zone per lotti superiori a 25.000 sq.ft.
\n",
"**INDUS** Percentuale di ettari di attività non al dettaglio per città.
\n",
"**CHAS** Variabile dummy che indica la prossimità al fiume Charles.
\n",
"**NOX** Concentrazione di ossido d'azoto (parti per 10 milioni).
\n",
"**RM** Numero medio di stanze per abitazione
\n",
"**AGE** Percentuale di abitazione occupate costruite dopo il 1940
\n",
"**DIS** Media pesata delle distanze da 5 centri lavorativi di Boston.
\n",
"**RAD** Indice di accessibilità ad autostrade
\n",
"**TAX** Aliquota dell'imposta sulla proprietà a valore pieno in 10.000 USD.
\n",
"**PRATIO** Rapporto studente-insegnante per città.
\n",
"**BLACK** 1000(Bk - 0.63)^2 dove Bk è la percentuale di abitanti di colore per città
\n",
"**LSTAT** Percentuale della popolazione povera
\n",
"**MEDV** Mediana del valore di abitazioni occupate in 1.000 USD.
"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hSNrm0q6yr6L"
},
"source": [
"##Preprocessing dei dati\n",
"Creiamo una lista con i nomi delle colonne che saranno le features del nostro modello, cioè tutte le colonne meno l'ID e il target (MEDV)."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 40732,
"status": "ok",
"timestamp": 1592474435040,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "UbZ59A5Qyr6L"
},
"outputs": [],
"source": [
"features_cols = housing_df.columns[1:-1]"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "UUxRU0FXyr6O"
},
"source": [
"La classe MLlib richiede che le features si trovino tutte all'interno di un unico vettore su di una colonna, possiamo creare questa rappresentazione utilizzando la classe *VectorAssemlber* di MLlib."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 40934,
"status": "ok",
"timestamp": 1592474435256,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "7mSavt2A3Agc",
"outputId": "b293ab86-84b9-4e27-89c7-1eb0e0e1931a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| b|lstat|medv| features|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+\n",
"|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98|24.0|[18.0,2.31,0.0,0....|\n",
"|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14|21.6|[0.0,7.07,0.0,0.4...|\n",
"|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|34.7|[0.0,7.07,0.0,0.4...|\n",
"|0.03237| 0.0| 2.18| 0|0.458|6.998|45.8|6.0622| 3|222| 18.7|394.63| 2.94|33.4|[0.0,2.18,0.0,0.4...|\n",
"|0.06905| 0.0| 2.18| 0|0.458|7.147|54.2|6.0622| 3|222| 18.7| 396.9| 5.33|36.2|[0.0,2.18,0.0,0.4...|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"assembler = VectorAssembler(inputCols=features_cols, outputCol=\"features\")\n",
"data_df = assembler.transform(housing_df)\n",
"data_df.show(5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "dlmm6mExyr6S"
},
"source": [
"E' buona norma portare le features in un range di valori comuni, questo processo può velocizzare anche di molto la fase di addestramento. Facciamolo utilizzando la **normalizzazione** che si esegue sottraendo il valore minimo e poi dividendo per la differenza tra valore massimo e valore minimo. Possiamo eseguire la normalizzazione con MLlib usando la classe *MinMaxScaler*."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"colab_type": "code",
"executionInfo": {
"elapsed": 42732,
"status": "ok",
"timestamp": 1592474437073,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "pJl3Han6yr6S",
"outputId": "7a8cdcab-71a2-4c10-fdc7-2263458c12c7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+--------------------+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| b|lstat|medv| features| scaled_features|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+--------------------+\n",
"|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98|24.0|[18.0,2.31,0.0,0....|[0.18,0.067815249...|\n",
"|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14|21.6|[0.0,7.07,0.0,0.4...|[0.0,0.2423020527...|\n",
"|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|34.7|[0.0,7.07,0.0,0.4...|[0.0,0.2423020527...|\n",
"|0.03237| 0.0| 2.18| 0|0.458|6.998|45.8|6.0622| 3|222| 18.7|394.63| 2.94|33.4|[0.0,2.18,0.0,0.4...|[0.0,0.0630498533...|\n",
"|0.06905| 0.0| 2.18| 0|0.458|7.147|54.2|6.0622| 3|222| 18.7| 396.9| 5.33|36.2|[0.0,2.18,0.0,0.4...|[0.0,0.0630498533...|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+--------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import MinMaxScaler\n",
"\n",
"scaler = MinMaxScaler(inputCol=\"features\", outputCol=\"scaled_features\")\n",
"scaler_model = scaler.fit(data_df)\n",
"data_df = scaler_model.transform(data_df)\n",
"\n",
"data_df.show(5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "b9iZ1Tn9yr6V"
},
"source": [
"Prossimo passo, dividere il DataFrame con le features preprocessate in due DataFrame, uno per l'addestramento e uno per il testing del modello, possiamo farlo utilizzando il metodo *randomSplit* all'interno della quale dobbiamo passare una lista con la percentuale di osservazioni da assegnare ad ognuno dei DataFrame.
\n",
"Nel nostro caso assegnamo il 70% degli esempi al set di addestramento e il 30% al set di test."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"executionInfo": {
"elapsed": 45805,
"status": "ok",
"timestamp": 1592474440175,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "ivzguatuyr6V",
"outputId": "2abb1c09-6538-4ede-edc3-73331fe1339a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"384 esempi nel train set\n",
"122 esempi nel test set\n"
]
}
],
"source": [
"train_df, test_df = data_df.randomSplit([0.7, 0.3])\n",
"\n",
"print(\"%d esempi nel train set\" % train_df.count())\n",
"print(\"%d esempi nel test set\" % test_df.count())"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "omgjPKqzyr6Y"
},
"source": [
"Ottimo ! Possiamo creare il modello di Regressione Lineare, usiamo la classe *LinearRegression, all'interno del costruttore dovremo passare due parametri:\n",
"* **featuresCol**: il nome della colonna con le features\n",
"* **labelCol**: il nome della colonna con il target"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 45786,
"status": "ok",
"timestamp": 1592474440177,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "CaZZQZ9uyr6Z"
},
"outputs": [],
"source": [
"from pyspark.ml.regression import LinearRegression\n",
"\n",
"lr = LinearRegression(featuresCol=\"scaled_features\", labelCol=\"medv\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "O1NLhjeXyr6b"
},
"source": [
"Avviamo l'addestramento con il metodo *fit*, passando al suo interno il set di addetramento"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 51020,
"status": "ok",
"timestamp": 1592474445424,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "lkFmy82kyr6b"
},
"outputs": [],
"source": [
"model = lr.fit(train_df)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "iTonUqlayr6e"
},
"source": [
"Abbiamo creato il nostro modello ! Ora verifichiamone la qualità testandolo su dati che non ha visto durante l'addestramento, possiamo farlo usando il test set e il metodo *evalualte*."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 51801,
"status": "ok",
"timestamp": 1592474446218,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "RF8oejl4yr6g"
},
"outputs": [],
"source": [
"evaluation = model.evaluate(test_df)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "NDXAtauYyr6j"
},
"source": [
"Il metodo *evaluate* calcolerà diverse metriche che ci possono aiutare a comprendere la qualità del modello, vediamone alcune."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YbKZZdTByr6j"
},
"source": [
"#### MAE - Mean Absolute Error (Errore medio assoluto)\n",
"\n",
"L'errore medio assoluto consiste nella media della somma del valore assoluto degli errori.\n",
"\n",
"$$ MAE = \\frac{\\sum_{i=1}^n |y_i-\\hat{y}_i|}{n} $$"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 51792,
"status": "ok",
"timestamp": 1592474446220,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "3iMMzOCByr6k",
"outputId": "9b1bfcf4-daf6-40e1-9513-1489febb0328"
},
"outputs": [
{
"data": {
"text/plain": [
"3.674155517764444"
]
},
"execution_count": 17,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"evaluation.meanAbsoluteError"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Zprp1oe6yr6m"
},
"source": [
"#### MSE - Mean Squared Error (Errore quadratico assoluto)\n",
"\n",
"L'errore quadratico medio consiste nella media della somma degli errori al quadrato.\n",
"\n",
"$$ MSE = \\frac{\\sum_{i=1}^n (y_i-\\hat{y}_i)^2}{n}$$"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 51779,
"status": "ok",
"timestamp": 1592474446221,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "okLkeNiXyr6n",
"outputId": "576fce44-39e6-45d5-93f3-99b9fda22953"
},
"outputs": [
{
"data": {
"text/plain": [
"28.045312901767556"
]
},
"execution_count": 18,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"evaluation.meanSquaredError"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "lT5MFGh1yr6q"
},
"source": [
"#### RMSE - Root Mean Squared Error (Radice dell'errore quadratico medio)\n",
"\n",
"Il RMSE è la radice dell'errore quadratico medio, questa metrica indica mediamente di quanto il nostro modello si è sbagliato.\n",
"\n",
"$$ RMSE = \\sqrt \\frac{\\sum_{i=1}^n (y_i-\\hat{y}_i)^2}{n}$$"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 51768,
"status": "ok",
"timestamp": 1592474446222,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "Rx3h0r1tyr6q",
"outputId": "88f376de-f32c-4d47-d1cb-f2b4c2607568"
},
"outputs": [
{
"data": {
"text/plain": [
"5.295782558014213"
]
},
"execution_count": 19,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"evaluation.rootMeanSquaredError"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "U0aYeO9zyr6t"
},
"source": [
"#### R2 - Coefficient of determination (Coefficiente di Determinazione)\n",
"\n",
"In pratica R2 (pronuciato R Squared) è una versione standardizzata del MSE che torna un punteggio compreso tra 0 e 1 per il train set, mentre per il test set può assumere anche valori negativi. Essendo una funzione ma di scoring, un suo valore maggiore indica una qualità migliore del modello, il suo valore può essere così interpretato:\n",
"\n",
"* R2_score < 0.3 il modello è inutile.\n",
"* 0.3 < R2_score < 0.5 il modello è scarso.\n",
"* 0.5 < R2_score < 0.7 il modello è discreto.\n",
"* 0.7 < R2_score < 0.9 il modello è buono.\n",
"* 0.9 < R2_score < 1 il modello è ottimo.\n",
"* R2_score = 1 molto probabilmente c'è un errore nel modello.\n",
"\n",
"$$ R^2 = 1-\\frac{RSS}{SST} $$"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hj56oYZCyr6t"
},
"source": [
"dove RSS è la somma dei quadrati residui:\n",
"$$ RSS = \\sum_{i=1}^{N}(Y_i-\\hat{Y}_i)^2 $$"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "jW3KVs1syr6t"
},
"source": [
"ed SST è la somma dei quadrati totali:\n",
"$$ SST = \\sum_{i=1}^{N}(Y_i-\\bar{Y})^2 $$"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"colab_type": "code",
"executionInfo": {
"elapsed": 51757,
"status": "ok",
"timestamp": 1592474446223,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "PmfaaWZByr6u",
"outputId": "7a696c03-7c87-48b9-c96b-7fcf3d035a0f"
},
"outputs": [
{
"data": {
"text/plain": [
"0.7028886094316555"
]
},
"execution_count": 20,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"evaluation.r2"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "GYMSa2Syyr6y"
},
"source": [
"### Testiamo il Modello con Dataset Reale"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 53322,
"status": "ok",
"timestamp": 1592474447800,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "jd4oqCDhyr6y",
"outputId": "9b886cfe-b7ea-4a75-e95d-7eacaee437c8"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2020-06-18 10:00:46-- https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/houses.csv\n",
"Resolving frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)... 52.95.148.2\n",
"Connecting to frenzy86.s3.eu-west-2.amazonaws.com (frenzy86.s3.eu-west-2.amazonaws.com)|52.95.148.2|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 700 [application/vnd.ms-excel]\n",
"Saving to: ‘houses.csv.3’\n",
"\n",
"\r",
"houses.csv.3 0%[ ] 0 --.-KB/s \r",
"houses.csv.3 100%[===================>] 700 --.-KB/s in 0s \n",
"\n",
"2020-06-18 10:00:47 (11.3 MB/s) - ‘houses.csv.3’ saved [700/700]\n",
"\n"
]
}
],
"source": [
"!wget https://frenzy86.s3.eu-west-2.amazonaws.com/fav/tecno/houses.csv"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 273
},
"colab_type": "code",
"executionInfo": {
"elapsed": 54427,
"status": "ok",
"timestamp": 1592474448936,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "U351Y3YVyr60",
"outputId": "9d5bac4f-ec3d-44e1-f488-a925be18dabe"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| black|lstat|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+\n",
"|0.05789|12.5| 6.07| 0|0.409|5.878|21.4| 6.498| 4|345| 18.9|396.21| 8.1|\n",
"|0.13554|12.5| 6.07| 0|0.409|5.594|36.8| 6.498| 4|345| 18.9| 396.9|13.09|\n",
"|0.08826| 0.0|10.81| 0|0.413|6.417| 6.6|5.2873| 4|305| 19.2|383.73| 6.72|\n",
"|0.09164| 0.0|10.81| 0|0.413|6.065| 7.8|5.2873| 4|305| 19.2|390.91| 5.52|\n",
"|0.19539| 0.0|10.81| 0|0.413|6.245| 6.2|5.2873| 4|305| 19.2|377.17| 7.54|\n",
"|0.07896| 0.0|12.83| 0|0.437|6.273| 6.0|4.2515| 5|398| 18.7|394.92| 6.78|\n",
"|0.09512| 0.0|12.83| 0|0.437|6.286|45.0|4.5026| 5|398| 18.7|383.23| 8.94|\n",
"|0.10153| 0.0|12.83| 0|0.437|6.279|74.5|4.0522| 5|398| 18.7|373.66|11.97|\n",
"|0.08707| 0.0|12.83| 0|0.437| 6.14|45.8|4.0905| 5|398| 18.7|386.96|10.27|\n",
"|0.04741| 0.0|11.93| 0|0.573| 6.03|80.8| 2.505| 1|273| 21.0| 396.9| 7.88|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+\n",
"\n"
]
}
],
"source": [
"houses_df = spark.read.csv(\"houses.csv\", inferSchema=True, header=True)\n",
"houses_df.show(10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "8Oqvz8Ziyr63"
},
"source": [
"Creiamo la colonna con le features."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 552,
"status": "ok",
"timestamp": 1592474773626,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "0AMZZ9rd4hgE"
},
"outputs": [],
"source": [
"features_cols = housing_df.columns[1:-1]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"colab_type": "code",
"executionInfo": {
"elapsed": 893,
"status": "ok",
"timestamp": 1592474774899,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "WbgmFwRM4ufT",
"outputId": "829b9697-8e9e-4570-b520-4b792d9700ab"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| b|lstat|medv| features|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+\n",
"|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98|24.0|[18.0,2.31,0.0,0....|\n",
"|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14|21.6|[0.0,7.07,0.0,0.4...|\n",
"|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|34.7|[0.0,7.07,0.0,0.4...|\n",
"|0.03237| 0.0| 2.18| 0|0.458|6.998|45.8|6.0622| 3|222| 18.7|394.63| 2.94|33.4|[0.0,2.18,0.0,0.4...|\n",
"|0.06905| 0.0| 2.18| 0|0.458|7.147|54.2|6.0622| 3|222| 18.7| 396.9| 5.33|36.2|[0.0,2.18,0.0,0.4...|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"assembler = VectorAssembler(inputCols=features_cols, outputCol=\"features\")\n",
"input_df = assembler.transform(housing_df)\n",
"input_df.show(5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "xaGsrtzYyr65"
},
"source": [
"Applichiamo la normalizzazione, assicurandoci di applicare la stessa trasformazione che abbiamo applicato agli esempi di addestramento. In che modo ? Utilizzando solamente il meotodo *transform* dello stesso oggetto sulla quale abbiamo già eseguito *fit* sui dati di addestramento."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"colab_type": "code",
"executionInfo": {
"elapsed": 864,
"status": "ok",
"timestamp": 1592474776690,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "1_rz1462yr66",
"outputId": "e6fc7506-c5bf-495e-daa9-c668d4a261de"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+--------------------+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| b|lstat|medv| features| scaled_features|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+--------------------+\n",
"|0.00632|18.0| 2.31| 0|0.538|6.575|65.2| 4.09| 1|296| 15.3| 396.9| 4.98|24.0|[18.0,2.31,0.0,0....|[0.18,0.067815249...|\n",
"|0.02731| 0.0| 7.07| 0|0.469|6.421|78.9|4.9671| 2|242| 17.8| 396.9| 9.14|21.6|[0.0,7.07,0.0,0.4...|[0.0,0.2423020527...|\n",
"|0.02729| 0.0| 7.07| 0|0.469|7.185|61.1|4.9671| 2|242| 17.8|392.83| 4.03|34.7|[0.0,7.07,0.0,0.4...|[0.0,0.2423020527...|\n",
"|0.03237| 0.0| 2.18| 0|0.458|6.998|45.8|6.0622| 3|222| 18.7|394.63| 2.94|33.4|[0.0,2.18,0.0,0.4...|[0.0,0.0630498533...|\n",
"|0.06905| 0.0| 2.18| 0|0.458|7.147|54.2|6.0622| 3|222| 18.7| 396.9| 5.33|36.2|[0.0,2.18,0.0,0.4...|[0.0,0.0630498533...|\n",
"+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+--------------------+--------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"input_df = scaler_model.transform(input_df)\n",
"input_df.show(5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Z5oK1AXpyr68"
},
"source": [
"Adesso utilizziamo il meotod *predict* del modello per ottenere la sua predizione, che verrà inserita all'interno di una colonna 'prediction'."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 310
},
"colab_type": "code",
"executionInfo": {
"elapsed": 509,
"status": "ok",
"timestamp": 1592474777945,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "HxrZIHixyr68",
"outputId": "3c689a4d-8f2f-460f-c243-9025a3554d0f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+--------------------+------------------+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| b|lstat|medv| features| scaled_features| prediction|\n",
"+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+--------------------+------------------+\n",
"|0.00632|18.0| 2.31| 0|0.538|6.575| 65.2| 4.09| 1|296| 15.3| 396.9| 4.98|24.0|[18.0,2.31,0.0,0....|[0.18,0.067815249...|30.391022737957485|\n",
"|0.02731| 0.0| 7.07| 0|0.469|6.421| 78.9|4.9671| 2|242| 17.8| 396.9| 9.14|21.6|[0.0,7.07,0.0,0.4...|[0.0,0.2423020527...|25.123059832928902|\n",
"|0.02729| 0.0| 7.07| 0|0.469|7.185| 61.1|4.9671| 2|242| 17.8|392.83| 4.03|34.7|[0.0,7.07,0.0,0.4...|[0.0,0.2423020527...|31.072444051044197|\n",
"|0.03237| 0.0| 2.18| 0|0.458|6.998| 45.8|6.0622| 3|222| 18.7|394.63| 2.94|33.4|[0.0,2.18,0.0,0.4...|[0.0,0.0630498533...| 29.22011369728967|\n",
"|0.06905| 0.0| 2.18| 0|0.458|7.147| 54.2|6.0622| 3|222| 18.7| 396.9| 5.33|36.2|[0.0,2.18,0.0,0.4...|[0.0,0.0630498533...|28.387542014865595|\n",
"|0.02985| 0.0| 2.18| 0|0.458| 6.43| 58.7|6.0622| 3|222| 18.7|394.12| 5.21|28.7|[0.0,2.18,0.0,0.4...|[0.0,0.0630498533...|25.661083048377893|\n",
"|0.08829|12.5| 7.87| 0|0.524|6.012| 66.6|5.5605| 5|311| 15.2| 395.6|12.43|22.9|[12.5,7.87,0.0,0....|[0.125,0.27162756...|22.785168258862452|\n",
"|0.14455|12.5| 7.87| 0|0.524|6.172| 96.1|5.9505| 5|311| 15.2| 396.9|19.15|27.1|[12.5,7.87,0.0,0....|[0.125,0.27162756...| 18.88146478111009|\n",
"|0.21124|12.5| 7.87| 0|0.524|5.631|100.0|6.0821| 5|311| 15.2|386.63|29.93|16.5|[12.5,7.87,0.0,0....|[0.125,0.27162756...|10.166879188464911|\n",
"|0.17004|12.5| 7.87| 0|0.524|6.004| 85.9|6.5921| 5|311| 15.2|386.71| 17.1|18.9|[12.5,7.87,0.0,0....|[0.125,0.27162756...| 18.54179621600624|\n",
"+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+--------------------+------------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"pred_df = model.transform(input_df)\n",
"pred_df.show(10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "S2MSYQ7Syr6-"
},
"source": [
"Ora rimuoviamo le colonne col le features, il prezzo è rappresentato in $10.000, quindi moltiplichiamo per questa cifra per ottenere il prezzo reale e rinominiamo la colonna 'prediction' in 'estimanted_price'."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 290
},
"colab_type": "code",
"executionInfo": {
"elapsed": 783,
"status": "ok",
"timestamp": 1592474780248,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "p-dPDe8eyr6_",
"outputId": "6e2ad770-9087-4bce-a680-9c7a92a83679"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+---------------+\n",
"| crim| zn|indus|chas| nox| rm| age| dis|rad|tax|ptratio| b|lstat|medv|estimated_price|\n",
"+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+---------------+\n",
"|0.00632|18.0| 2.31| 0|0.538|6.575| 65.2| 4.09| 1|296| 15.3| 396.9| 4.98|24.0| 303910.23|\n",
"|0.02731| 0.0| 7.07| 0|0.469|6.421| 78.9|4.9671| 2|242| 17.8| 396.9| 9.14|21.6| 251230.6|\n",
"|0.02729| 0.0| 7.07| 0|0.469|7.185| 61.1|4.9671| 2|242| 17.8|392.83| 4.03|34.7| 310724.44|\n",
"|0.03237| 0.0| 2.18| 0|0.458|6.998| 45.8|6.0622| 3|222| 18.7|394.63| 2.94|33.4| 292201.14|\n",
"|0.06905| 0.0| 2.18| 0|0.458|7.147| 54.2|6.0622| 3|222| 18.7| 396.9| 5.33|36.2| 283875.42|\n",
"|0.02985| 0.0| 2.18| 0|0.458| 6.43| 58.7|6.0622| 3|222| 18.7|394.12| 5.21|28.7| 256610.83|\n",
"|0.08829|12.5| 7.87| 0|0.524|6.012| 66.6|5.5605| 5|311| 15.2| 395.6|12.43|22.9| 227851.68|\n",
"|0.14455|12.5| 7.87| 0|0.524|6.172| 96.1|5.9505| 5|311| 15.2| 396.9|19.15|27.1| 188814.65|\n",
"|0.21124|12.5| 7.87| 0|0.524|5.631|100.0|6.0821| 5|311| 15.2|386.63|29.93|16.5| 101668.79|\n",
"|0.17004|12.5| 7.87| 0|0.524|6.004| 85.9|6.5921| 5|311| 15.2|386.71| 17.1|18.9| 185417.96|\n",
"+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+---------------+\n",
"only showing top 10 rows\n",
"\n"
]
}
],
"source": [
"from pyspark.sql.functions import round\n",
"\n",
"pred_df = pred_df.drop(\"features\") \\\n",
" .drop(\"scaled_features\") \\\n",
" .withColumn(\"estimated_price\", round(pred_df[\"prediction\"]*10000, 2)) \\\n",
" .drop(\"prediction\")\n",
"\n",
"pred_df.show(10)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"colab": {},
"colab_type": "code",
"executionInfo": {
"elapsed": 584,
"status": "ok",
"timestamp": 1592474699770,
"user": {
"displayName": "T3Lab Vision",
"photoUrl": "",
"userId": "14779383426442114373"
},
"user_tz": -120
},
"id": "A6vqr1Hv43gc"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "pyspark_boston.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
},
"name": "linear_regression",
"notebookId": 2246374182813661
},
"nbformat": 4,
"nbformat_minor": 1
}