{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "lmJ97hWzo2zF" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "Sszrzyybok2v" }, "outputs": [], "source": [ "################ template to run PySpark on Colab #######################" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 38220, "status": "ok", "timestamp": 1592471165708, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "0WGeMNr-ok5y" }, "outputs": [], "source": [ "!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n", "!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz\n", "!tar xf spark-2.4.5-bin-hadoop2.7.tgz\n", "!pip install -q findspark" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 37863, "status": "ok", "timestamp": 1592471165709, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "S33Pv2YDokyN" }, "outputs": [], "source": [ "import os\n", "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-8-openjdk-amd64\"\n", "os.environ[\"SPARK_HOME\"] = \"/content/spark-2.4.5-bin-hadoop2.7\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 42456, "status": "ok", "timestamp": 1592471170604, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "VIQewOA8oktH" }, "outputs": [], "source": [ "import findspark\n", "findspark.init()\n", "\n", "from pyspark.sql import SparkSession\n", "spark = SparkSession.builder.master(\"local[*]\").getOrCreate()\n", "spark1 = SparkSession.builder.appName('basic').getOrCreate()\n", "#Test must give no error" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 42140, "status": "ok", "timestamp": 1592471170605, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "0VyUq14yokoz" }, "outputs": [], "source": [ "import pyspark" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 39102, "status": "ok", "timestamp": 1592471170606, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "CAbqiCrDokkX" }, "outputs": [], "source": [ "from pyspark import SparkConf, SparkContext\n", "conf = SparkConf().setAppName(\"basic\").setMaster(\"local\")\n", "#sc = SparkContext(conf=conf) ## for jupyter and Databricks\n", "sc = SparkContext.getOrCreate() ## for Colab" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 38630, "status": "ok", "timestamp": 1592471170607, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "N55X_8Rgokia" }, "outputs": [], "source": [ "from pyspark.sql.types import *" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "2h--i1EMq2YJ" }, "source": [ "# Analisi recensioni film\n", "In questo notebook utilizzeremo Spark con il modulo SparkSQL ed un Dataframe per analizzare oltre 28 milioni di recensioni di film. Nello specifico le domande alla quale cerchermo di dare una risposta sono le seguenti:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 8283, "status": "ok", "timestamp": 1592471065605, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "8qQ6ThrkoGVt", "outputId": "02d1b89e-be4c-46aa-f95c-1dcbdfad7c31" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2020-06-18 09:04:19-- http://files.grouplens.org/datasets/movielens/ml-latest.zip\n", "Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152\n", "Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 277113433 (264M) [application/zip]\n", "Saving to: ‘ml-latest.zip’\n", "\n", "ml-latest.zip 100%[===================>] 264.28M 54.8MB/s in 5.1s \n", "\n", "2020-06-18 09:04:24 (51.6 MB/s) - ‘ml-latest.zip’ saved [277113433/277113433]\n", "\n" ] } ], "source": [ "!wget http://files.grouplens.org/datasets/movielens/ml-latest.zip" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 170 }, "colab_type": "code", "executionInfo": { "elapsed": 14745, "status": "ok", "timestamp": 1592471081854, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "IgtM4cu1qt2x", "outputId": "373b9305-c859-4bdb-90d2-f6b3a1b6f778" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: ml-latest.zip\n", " creating: ml-latest/\n", " inflating: ml-latest/links.csv \n", " inflating: ml-latest/tags.csv \n", " inflating: ml-latest/genome-tags.csv \n", " inflating: ml-latest/ratings.csv \n", " inflating: ml-latest/README.txt \n", " inflating: ml-latest/genome-scores.csv \n", " inflating: ml-latest/movies.csv \n" ] } ], "source": [ "!unzip ml-latest.zip" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 51 }, "colab_type": "code", "executionInfo": { "elapsed": 17229, "status": "ok", "timestamp": 1592471084764, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "M9EFzaUxquNO", "outputId": "a929cefb-17e6-4b8a-dd6b-51729b956d22" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "genome-scores.csv links.csv ratings.csv tags.csv\n", "genome-tags.csv movies.csv README.txt\n" ] } ], "source": [ "!ls ml-latest" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "MVhDEYQ0rDao" }, "source": [ "I file che ci interessano sono:\n", "\n", "* ratings.csv: che contiene, per ogni riga, id dell'utente, id del film, valutazione da 1.0 a 5.0 e timestamp.\n", "* movies.csv: che contiene nome e genere dei film associati agli id." ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "E7lRjm3nrMYD" }, "source": [ "## Importiamo il dataset in un Dataframe\n", "Per caricare un csv all'interno di un Dataframe possiamo utilizzare il metodo *.load(filepath, type)*, i principali formati supportati sono csv, json e orc." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 6232, "status": "ok", "timestamp": 1592471176852, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "xHIsq0gCquHc", "outputId": "30038955-0ae9-47d8-8f26-5a5c98c3efa9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+----------+\n", "| _c0| _c1| _c2| _c3|\n", "+------+-------+------+----------+\n", "|userId|movieId|rating| timestamp|\n", "| 1| 307| 3.5|1256677221|\n", "| 1| 481| 3.5|1256677456|\n", "| 1| 1091| 1.5|1256677471|\n", "| 1| 1257| 4.5|1256677460|\n", "+------+-------+------+----------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "df = spark.read.load(\"ml-latest/ratings.csv\", format=\"csv\")\n", "df.show(5)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 6846, "status": "ok", "timestamp": 1592471177470, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "lVpLfkS-quFG", "outputId": "9b5b246f-b928-4854-d787-eb2e92f858dd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+----------+\n", "| _c0| _c1| _c2| _c3|\n", "+------+-------+------+----------+\n", "|userId|movieId|rating| timestamp|\n", "| 1| 307| 3.5|1256677221|\n", "| 1| 481| 3.5|1256677456|\n", "| 1| 1091| 1.5|1256677471|\n", "| 1| 1257| 4.5|1256677460|\n", "+------+-------+------+----------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "df = spark.read.csv(\"ml-latest/ratings.csv\")\n", "df.show(5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 119 }, "colab_type": "code", "executionInfo": { "elapsed": 6842, "status": "ok", "timestamp": 1592471177471, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "x9oIbjWcquDF", "outputId": "ffefcf13-4aec-4369-e4d1-cef0d40dfe07" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- _c0: string (nullable = true)\n", " |-- _c1: string (nullable = true)\n", " |-- _c2: string (nullable = true)\n", " |-- _c3: string (nullable = true)\n", "\n" ] } ], "source": [ "df.printSchema()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "qUJvecgHrano" }, "source": [ "I nomi delle colonne, presenti alla prima riga del file, non sono stati riconosciuti, inoltre anche il tipo delle colonne è totalmente sbagliato, per risolvere questi due problemi ci basta utilizzare due parametri:\n", "* **header**: se impostato a True indica al metodo che la prima riga del file contiene i nomi delle colonne.\n", "* **inferSchema**: impostandolo a True il tipo delle colonne verrà rilevato automaticamente." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 102 }, "colab_type": "code", "executionInfo": { "elapsed": 42599, "status": "ok", "timestamp": 1592471228676, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "Gy3PR5itquAl", "outputId": "4a5829bc-31aa-4992-9bfb-887189ed4cea" }, "outputs": [ { "data": { "text/plain": [ "[Row(userId=1, movieId=307, rating=3.5, timestamp=1256677221),\n", " Row(userId=1, movieId=481, rating=3.5, timestamp=1256677456),\n", " Row(userId=1, movieId=1091, rating=1.5, timestamp=1256677471),\n", " Row(userId=1, movieId=1257, rating=4.5, timestamp=1256677460),\n", " Row(userId=1, movieId=1449, rating=4.5, timestamp=1256677264)]" ] }, "execution_count": 14, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "df = spark.read.csv(\"ml-latest/ratings.csv\", header=True, inferSchema=True)\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "fzhf5lR6rhqh" }, "source": [ "## Correggiamo lo schema\n", "Piuttosto che utilizzare numeri per gli id, usiamo delle stringhe, quindi definiamo uno schema manualmente." ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 733, "status": "ok", "timestamp": 1592471230448, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "uCM8jbukqt99", "outputId": "ee9836d3-8bdb-4d82-a9d6-760a09748c34" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+----------+\n", "|userID|movieID|rating| timestamp|\n", "+------+-------+------+----------+\n", "| 1| 307| 3.5|1256677221|\n", "| 1| 481| 3.5|1256677456|\n", "| 1| 1091| 1.5|1256677471|\n", "| 1| 1257| 4.5|1256677460|\n", "| 1| 1449| 4.5|1256677264|\n", "+------+-------+------+----------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "data_schema = [StructField('userID', StringType(), True),\n", " StructField('movieID', StringType(), True),\n", " StructField('rating', FloatType(), True),\n", " StructField('timestamp', IntegerType(), True)]\n", " \n", "schema = StructType(fields=data_schema)\n", "\n", "df = spark.read.schema(schema).option(\"header\",\"true\").option(\"inferSchema\",\"false\").csv(\"ml-latest/ratings.csv\")\n", "df.show(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "cJFA0aTtrrT0" }, "source": [ "Il timestamp è in formato UNIX, convertiamo in una data utilizzando le funzioni from_unix_time(unix_time) e to_date(time) di spark." ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 460 }, "colab_type": "code", "executionInfo": { "elapsed": 843, "status": "ok", "timestamp": 1592471255020, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "N3zAdVLDqt7Z", "outputId": "44e6987e-c5f4-4582-fc9f-22abbf80dfae" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+----------+\n", "|userID|movieID|rating| timestamp|\n", "+------+-------+------+----------+\n", "| 1| 307| 3.5|2009-10-27|\n", "| 1| 481| 3.5|2009-10-27|\n", "| 1| 1091| 1.5|2009-10-27|\n", "| 1| 1257| 4.5|2009-10-27|\n", "| 1| 1449| 4.5|2009-10-27|\n", "| 1| 1590| 2.5|2009-10-27|\n", "| 1| 1591| 1.5|2009-10-27|\n", "| 1| 2134| 4.5|2009-10-27|\n", "| 1| 2478| 4.0|2009-10-27|\n", "| 1| 2840| 3.0|2009-10-27|\n", "| 1| 2986| 2.5|2009-10-27|\n", "| 1| 3020| 4.0|2009-10-27|\n", "| 1| 3424| 4.5|2009-10-27|\n", "| 1| 3698| 3.5|2009-10-27|\n", "| 1| 3826| 2.0|2009-10-27|\n", "| 1| 3893| 3.5|2009-10-27|\n", "| 2| 170| 3.5|2007-10-20|\n", "| 2| 849| 3.5|2007-10-20|\n", "| 2| 1186| 3.5|2007-10-20|\n", "| 2| 1235| 3.0|2007-10-20|\n", "+------+-------+------+----------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import from_unixtime , to_date\n", "\n", "df.withColumn('timestamp', to_date(from_unixtime(df[\"timestamp\"]))).show()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 749, "status": "ok", "timestamp": 1592471267858, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "YuwIt1CVrvRf", "outputId": "b5c591ed-21a1-43e2-b765-781dfcbf7503" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+-------------------+\n", "|userID|movieID|rating| timestamp|\n", "+------+-------+------+-------------------+\n", "| 1| 307| 3.5|2009-10-27 21:00:21|\n", "| 1| 481| 3.5|2009-10-27 21:04:16|\n", "| 1| 1091| 1.5|2009-10-27 21:04:31|\n", "| 1| 1257| 4.5|2009-10-27 21:04:20|\n", "| 1| 1449| 4.5|2009-10-27 21:01:04|\n", "+------+-------+------+-------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import to_utc_timestamp\n", "\n", "df = df.withColumn('timestamp', to_utc_timestamp(from_unixtime(df[\"timestamp\"]), \"yyyy-MM-dd hh:mm:ss\"))\n", "df.show(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "xgaHvFMer0mE" }, "source": [ "Ora abbiamo sia data che ora, diamo uno sguardo allo schema." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 119 }, "colab_type": "code", "executionInfo": { "elapsed": 610, "status": "ok", "timestamp": 1592471281252, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "faYhh-virzZL", "outputId": "a1887912-4858-456c-a9d0-a3629db2d0c1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- userID: string (nullable = true)\n", " |-- movieID: string (nullable = true)\n", " |-- rating: float (nullable = true)\n", " |-- timestamp: timestamp (nullable = true)\n", "\n" ] } ], "source": [ "df.printSchema()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "rIwhxt1lr4ll" }, "source": [ "## Contiamo le recensioni nel dataset\n", "Per contare il numero totale di recenioni possiamo semplicemente utilizzare il metodo *.count()*." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 12637, "status": "ok", "timestamp": 1592471316823, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "sqHQaEVwrzTL", "outputId": "e79dcba4-ae2f-4655-9d7a-fae1372dc914" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "27753444\n" ] } ], "source": [ "total_reviews = df.count()\n", "print(total_reviews)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "Ico310xGsBLQ" }, "source": [ "## Contare il numero medio di recensioni per utente\n", "Quante recensioni ha scritto in media un'utente ? Per saperlo dobbiamo innanziatutto conoscere il numero di recensori unici all'interno del dataset, possiamo farlo utilizzando la funzione *countDisctinct(col)* " ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 119 }, "colab_type": "code", "executionInfo": { "elapsed": 36324, "status": "ok", "timestamp": 1592471388154, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "IZgG53IYrzRB", "outputId": "b18a307e-9d5c-432b-bf6e-7253a4616127" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+\n", "|count_reviewers|\n", "+---------------+\n", "| 283228|\n", "+---------------+\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import col, countDistinct\n", "\n", "total_unique_reviewers = df.agg(countDistinct(\"userID\").alias(\"count_reviewers\"))\n", "total_unique_reviewers.show()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 55279, "status": "ok", "timestamp": 1592471420646, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "j7akZnCOrzPl", "outputId": "6520c6c5-232d-4c83-e0f2-5d2cc1551dfc" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "283228\n" ] } ], "source": [ "total_unique_reviewers = total_unique_reviewers.head()[\"count_reviewers\"]\n", "print(total_unique_reviewers)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "-9BWtC7psNKr" }, "source": [ "Per calcolare il numero di recensioni per recensore dividiamo il numero totale di recensioni (calcolato per la domanda precedente) per il numero totale di recensori" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "colab_type": "code", "executionInfo": { "elapsed": 33615, "status": "ok", "timestamp": 1592471420647, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "w4ADnD6wrzM_", "outputId": "bf76292b-1caf-498e-dc71-a1270f76abed" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "97.98976089934611\n" ] } ], "source": [ "mean_reviews = total_reviews/total_unique_reviewers\n", "print(mean_reviews)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "33e_xe1FsU9O" }, "source": [ "## Trovare l'utente che ha scritto più recensioni\n", "Per trovare l'utente che ha scritto più recensioni ci basta raggruppare il Dataframe per gli user id, contare il numero di recensioni per utente e poi ordinare in base a questo valore" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 33915, "status": "ok", "timestamp": 1592471458422, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "G2YAuIhhrzKh", "outputId": "7ff38323-2e5c-4da1-a228-3ff755d7b963" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-----+\n", "|userID|count|\n", "+------+-----+\n", "|123100|23715|\n", "|117490| 9279|\n", "|134596| 8381|\n", "|212343| 7884|\n", "|242683| 7515|\n", "+------+-----+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "df.groupBy(\"userID\").count().orderBy(\"count\", ascending=False).show(5)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 1010, "status": "ok", "timestamp": 1592471459435, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "RMADo-eSsUNE" }, "outputs": [], "source": [ "#L'utente 123100 è quello che ha scritto più recenioni in assoluto, con ben 23715 recensioni" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 119 }, "colab_type": "code", "executionInfo": { "elapsed": 32777, "status": "ok", "timestamp": 1592471497765, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "0mjAWCsxsUJy", "outputId": "cbab58eb-99ca-4a83-c627-773038417385" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------------------+\n", "| avg(rating)|\n", "+------------------+\n", "|3.1306346194391734|\n", "+------------------+\n", "\n" ] } ], "source": [ "df.filter(\"userID == '123100'\").agg({\"rating\":\"mean\"}).show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "fRQ27o8gsln1" }, "source": [ "## Trovare i 10 film che hanno ricevuto più recensioni\n", "Questo è facile, per ottenere i film che hanno avuto più recensioni ci raggruppare per film." ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 655, "status": "ok", "timestamp": 1592471498423, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "NEdAV7vgsUHT" }, "outputs": [], "source": [ "dfMovies = df.groupBy(\"movieID\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 290 }, "colab_type": "code", "executionInfo": { "elapsed": 34911, "status": "ok", "timestamp": 1592471540013, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "Dqsi1OWfsUEb", "outputId": "b016f1b7-3716-4ef0-fbeb-263140e1eaa4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+-----+\n", "|movieID|count|\n", "+-------+-----+\n", "| 318|97999|\n", "| 356|97040|\n", "| 296|92406|\n", "| 593|87899|\n", "| 2571|84545|\n", "| 260|81815|\n", "| 480|76451|\n", "| 527|71516|\n", "| 110|68803|\n", "| 1|68469|\n", "+-------+-----+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "dfMovies.count().orderBy(\"count\", ascending=False).show(10)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "NRjzRgaZswiu" }, "source": [ "## Trovare i 10 film con la valutazione più alta\n", "Per prima cosa dobbiamo calcolare sul Datagroup ottenuto appena sopra, la valutazione media e il numero di volte che il film è stato valutato, possiamo farlo in 2 modi." ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 35144, "status": "ok", "timestamp": 1592471575693, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "QuT276SPrzIP", "outputId": "eb574465-b70c-48ff-a8b1-3fd9ad9e0d6a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+------------+\n", "|movieID| avg_rating|count_rating|\n", "+-------+------------------+------------+\n", "| 296| 4.173971387139363| 92406|\n", "| 1090|3.9017529880478086| 18825|\n", "| 2294|3.2357021735779252| 12974|\n", "| 3210| 3.636775639067115| 9819|\n", "| 48738| 3.849010703859877| 6166|\n", "+-------+------------------+------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "dfMoviesAvg = dfMovies.agg({\"rating\":\"mean\", \"movieID\":\"count\"}).withColumnRenamed(\"avg(rating)\",\"avg_rating\").withColumnRenamed(\"count(movieID)\",\"count_rating\")\n", "dfMoviesAvg.show(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "2dsQhS58s1qx" }, "source": [ "oppure" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 58335, "status": "ok", "timestamp": 1592471609990, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "LJX9UnLarybl", "outputId": "a3d7d18f-6ef3-4880-b860-c50b61b345f0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+------------+\n", "|movieID| avg_rating|count_rating|\n", "+-------+------------------+------------+\n", "| 296| 4.173971387139363| 92406|\n", "| 1090|3.9017529880478086| 18825|\n", "| 2294|3.2357021735779252| 12974|\n", "| 3210| 3.636775639067115| 9819|\n", "| 48738| 3.849010703859877| 6166|\n", "+-------+------------------+------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import avg, count\n", "\n", "dfMoviesAvg = dfMovies.agg(avg(\"rating\").alias(\"avg_rating\"), count(\"movieID\").alias(\"count_rating\"))\n", "dfMoviesAvg.show(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "hkFbJjmYtA-g" }, "source": [ "Per evitare di trovare in cima film che sono stati recensiti una sola volta con 5 stelle, filtriamo solo i film che hanno ricevuto più di 100 recensioni." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 627, "status": "ok", "timestamp": 1592471610620, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "2-Sq45MzsuIt" }, "outputs": [], "source": [ "dfMoviesMostRated = dfMoviesAvg.filter(\"count_rating > 100\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "3kaDAwNktGe7" }, "source": [ "Ora ci basta ordinare in base alla valutazione media." ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 290 }, "colab_type": "code", "executionInfo": { "elapsed": 36991, "status": "ok", "timestamp": 1592471659316, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "hp_RZV-ysuFP", "outputId": "636fc4cd-5d8d-4dff-e796-aa7b4f551e21" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+------------+\n", "|movieID| avg_rating|count_rating|\n", "+-------+------------------+------------+\n", "| 171011|4.4865181711606095| 853|\n", "| 159817| 4.458092485549133| 1384|\n", "| 318| 4.424188001918387| 97999|\n", "| 170705| 4.399898373983739| 984|\n", "| 174053| 4.350558659217877| 1074|\n", "| 171495| 4.343949044585988| 157|\n", "| 172591| 4.339667458432304| 421|\n", "| 858| 4.332892749244713| 60904|\n", "| 50| 4.291958829205532| 62180|\n", "| 176601| 4.263888888888889| 180|\n", "+-------+------------------+------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "dfMoviesTopRated = dfMoviesMostRated.orderBy(\"avg_rating\", ascending=False)\n", "dfMoviesTopRated.show(10)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "G7dgjOTUtMXP" }, "source": [ "## Trovare i 10 film con la valutazione più bassa\n", "Per trovare i 10 film con la valutazione più bassa, ci basta ottenere il dataframe ottenuto sopra, questa volta in maniera ascendente." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 290 }, "colab_type": "code", "executionInfo": { "elapsed": 44186, "status": "ok", "timestamp": 1592471694835, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "afBv1WKxsuCN", "outputId": "60c0d926-00bb-4508-e47c-62f4699e202a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+------------+\n", "|movieID| avg_rating|count_rating|\n", "+-------+------------------+------------+\n", "| 8859|0.8739495798319328| 238|\n", "| 6483|1.0138592750533049| 469|\n", "| 4775| 1.141025641025641| 741|\n", "| 1826|1.2038288288288288| 444|\n", "| 6587|1.2055555555555555| 810|\n", "| 31698|1.2441176470588236| 680|\n", "| 5739|1.2612359550561798| 178|\n", "| 61348|1.2672849915682969| 593|\n", "| 5738|1.3549382716049383| 162|\n", "| 3574|1.3580645161290323| 155|\n", "+-------+------------------+------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "dfMoviesWorstRated = dfMoviesMostRated.orderBy(\"avg_rating\", ascending=True)\n", "dfMoviesWorstRated.show(10)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "ks_u0zJxtUXg" }, "source": [ "## Trovare le 10 valutazioni più recenti\n", "Questo è semplice, ci basta eseguire l'ordinamento in base al timestamp, spark è in grado di ordinare anche delle date." ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 290 }, "colab_type": "code", "executionInfo": { "elapsed": 80307, "status": "ok", "timestamp": 1592471776907, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "TEXcxroist_D", "outputId": "73633f77-5cef-49f7-94c8-b14329ce7145" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+-------------------+\n", "|userID|movieID|rating| timestamp|\n", "+------+-------+------+-------------------+\n", "| 82922| 167780| 4.0|2018-09-26 06:59:09|\n", "| 82922| 53519| 4.0|2018-09-26 06:58:50|\n", "|280481| 494| 3.0|2018-09-26 06:58:47|\n", "|280481| 2355| 3.0|2018-09-26 06:58:43|\n", "|280481| 2294| 2.0|2018-09-26 06:58:41|\n", "|280481| 176101| 3.5|2018-09-26 06:58:30|\n", "|280481| 64614| 3.0|2018-09-26 06:58:22|\n", "| 82922| 165831| 4.0|2018-09-26 06:58:09|\n", "|280481| 1079| 2.5|2018-09-26 06:58:06|\n", "| 82922| 52281| 4.0|2018-09-26 06:58:05|\n", "+------+-------+------+-------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "df.orderBy(\"timestamp\", ascending=False).show(10)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "GAXDjVLZtiHI" }, "source": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "3GlO4AQHqcAN" }, "source": [ "## Trovare i film più visti per anno\n", "Assumption = solo 1% di chi vede il film recensisce\n", "\n", "Questa domanda è invece più complessa delle altre. Per semplificare le operazioni creiamo una nuova colonna che contiene soltanto l'anno, possiamo estrarre l'anno dalla data usando la funzione *year(timestamp)* di spark." ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 23485, "status": "ok", "timestamp": 1592471776908, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "M8JrW5AAtR_l", "outputId": "dd0a9f47-66c0-4129-f4a6-c16ff5c9028e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+----+\n", "|userID|movieID|rating|year|\n", "+------+-------+------+----+\n", "| 1| 307| 3.5|2009|\n", "| 1| 481| 3.5|2009|\n", "| 1| 1091| 1.5|2009|\n", "| 1| 1257| 4.5|2009|\n", "| 1| 1449| 4.5|2009|\n", "+------+-------+------+----+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import year\n", "\n", "dfWithYear = df.withColumn(\"year\", year(df['timestamp'])).drop(\"timestamp\")\n", "dfWithYear.show(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "Rr_Ut6oNtrQz" }, "source": [ "Ora raggruppiamo il dataframe sia per anno che per id del film, così facendo avremo dei gruppi caratterizzati dalle combinazioni di queste due colonne, e su questo nuovo dataframe calcoliamo il numero di valutazioni." ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 84249, "status": "ok", "timestamp": 1592471853503, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "zG4kH4WMtR9l", "outputId": "409b9de6-8910-4d22-8d57-d9bc7b40d012" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-------+------------+\n", "|year|movieID|count_rating|\n", "+----+-------+------------+\n", "|2005| 255| 43|\n", "|2005| 1917| 3460|\n", "|2005| 3793| 4089|\n", "|2005| 5064| 788|\n", "|2005| 6966| 257|\n", "+----+-------+------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import avg, count\n", "\n", "dfMovieYear = dfWithYear.groupBy(\"year\",\"movieID\").agg(count(\"rating\").alias(\"count_rating\"))\n", "dfMovieYear.show(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "JbEoSgNRtvNE" }, "source": [ "Adesso ci dovrebbe bastare raggruppare per anno per poi calcolare le valutazioni massime ricevute e ordinare in base a questo valore." ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 204 }, "colab_type": "code", "executionInfo": { "elapsed": 150371, "status": "ok", "timestamp": 1592471934234, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "zfPcAPoitR8N", "outputId": "314a1bc8-4436-47b2-9b50-e7844c0e3640" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+------------+\n", "|year|count_rating|\n", "+----+------------+\n", "|1996| 25760|\n", "|2015| 12776|\n", "|1997| 11350|\n", "|2016| 8976|\n", "|2017| 7873|\n", "+----+------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "from pyspark.sql.functions import max, col\n", "\n", "dfMostRatedYear = dfMovieYear.groupBy(\"year\").agg(max(\"count_rating\").alias(\"count_rating\")).orderBy(\"count_rating\", ascending=False)\n", "dfMostRatedYear.show(5)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "JBfNdpamt4ba" }, "source": [ "ERRORE!!! così facendo abbiamo perso la colonna movie id e, pur conoscendo quante volte è stato valutato il film più valutato dell'anno, non sappiamo quale questo effettivamente sia. Dobbiamo trovare un'altra strada. Una soluzione consiste nel sfruttare una Window insieme alla funzione where." ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 460 }, "colab_type": "code", "executionInfo": { "elapsed": 81335, "status": "ok", "timestamp": 1592472015581, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "zQVD_lXMtR48", "outputId": "49b66ab7-691d-4711-ea3e-920077e94159" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-------+------------+\n", "|year|movieID|count_rating|\n", "+----+-------+------------+\n", "|2003| 5952| 3684|\n", "|2007| 2571| 3409|\n", "|2018| 318| 4311|\n", "|2015| 2571| 12776|\n", "|2006| 7153| 4001|\n", "|2013| 318| 2714|\n", "|1997| 780| 11350|\n", "|2014| 318| 2672|\n", "|2004| 7153| 3697|\n", "|1996| 592| 25760|\n", "|1998| 1721| 2399|\n", "|2012| 79132| 2422|\n", "|2009| 58559| 3720|\n", "|2016| 318| 8976|\n", "|1995| 47| 1|\n", "|1995| 21| 1|\n", "|1995| 1176| 1|\n", "|1995| 1079| 1|\n", "|2001| 1210| 4517|\n", "|2005| 5952| 6228|\n", "+----+-------+------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "from pyspark.sql import Window\n", "from pyspark.sql.functions import max, col\n", "\n", "window = Window.partitionBy(\"year\")\n", "\n", "dfMostRatedYear = dfMovieYear.withColumn(\"max\", max(\"count_rating\").over(window)).where(col(\"count_rating\")==col(\"max\")).drop(\"max\")\n", "dfMostRatedYear.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "i_sBQLb_uLeT" }, "source": [ "Rimuoviamo il 1995 dal dataframe, dato che non contiene nessuna informazione utile." ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 81332, "status": "ok", "timestamp": 1592472015582, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "Xy7iim4htR2l" }, "outputs": [], "source": [ "dfMostRatedYear = dfMostRatedYear.where(\"year != 1995\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "XBsAt4jBuRFn" }, "source": [ "Ora moltiplichiamo il *count_rating* per 100 e, siccome abbiamo affermato che solo l'1% di chi vede un film lo recensisce, otteremo una stima del numero totale di spettatori per film." ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 494 }, "colab_type": "code", "executionInfo": { "elapsed": 196688, "status": "ok", "timestamp": 1592472130941, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "erNPE7lwtR0i", "outputId": "ae97fb32-b086-4dee-813d-f1c814c1732f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+-------+-------------+\n", "|year|movieID|total_viewers|\n", "+----+-------+-------------+\n", "|2003| 5952| 368400|\n", "|2007| 2571| 340900|\n", "|2018| 318| 431100|\n", "|2015| 2571| 1277600|\n", "|2006| 7153| 400100|\n", "|2013| 318| 271400|\n", "|1997| 780| 1135000|\n", "|2014| 318| 267200|\n", "|2004| 7153| 369700|\n", "|1996| 592| 2576000|\n", "|1998| 1721| 239900|\n", "|2012| 79132| 242200|\n", "|2009| 58559| 372000|\n", "|2016| 318| 897600|\n", "|2001| 1210| 451700|\n", "|2005| 5952| 622800|\n", "|2000| 1210| 745200|\n", "|2010| 72998| 388200|\n", "|2011| 79132| 328000|\n", "|2008| 2571| 420700|\n", "|2017| 318| 787300|\n", "|1999| 2396| 462100|\n", "|2002| 4993| 355100|\n", "+----+-------+-------------+\n", "\n" ] } ], "source": [ "dfMostViewedYear = dfMostRatedYear.withColumn(\"total_viewers\",dfMostRatedYear[\"count_rating\"]*100).drop(\"count_rating\")\n", "dfMostViewedYear.show(27)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "8VmUNQNSudNU" }, "source": [ "## Aggiungiamo titolo e genere alla lista dei film più visti per anno\n", "Abbiamo già detto che all'interno del file movies.csv si trovano titolo e genere dei film. Carichiamo tale film all'interno di un nuovo Dataframe" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 290 }, "colab_type": "code", "executionInfo": { "elapsed": 132257, "status": "ok", "timestamp": 1592472131679, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "beQ0J84VtRyC", "outputId": "959ecca8-d1a1-45f3-af5a-8238a555bf31" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+--------------------+--------------------+\n", "|movieId| title| genres|\n", "+-------+--------------------+--------------------+\n", "| 1| Toy Story (1995)|Adventure|Animati...|\n", "| 2| Jumanji (1995)|Adventure|Childre...|\n", "| 3|Grumpier Old Men ...| Comedy|Romance|\n", "| 4|Waiting to Exhale...|Comedy|Drama|Romance|\n", "| 5|Father of the Bri...| Comedy|\n", "| 6| Heat (1995)|Action|Crime|Thri...|\n", "| 7| Sabrina (1995)| Comedy|Romance|\n", "| 8| Tom and Huck (1995)| Adventure|Children|\n", "| 9| Sudden Death (1995)| Action|\n", "| 10| GoldenEye (1995)|Action|Adventure|...|\n", "+-------+--------------------+--------------------+\n", "only showing top 10 rows\n", "\n" ] } ], "source": [ "df_desc = spark.read.csv(\"ml-latest/movies.csv\", header=True, inferSchema=True)\n", "df_desc.show(10)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "eQjA2LEKunVw" }, "source": [ "Per aggiungere quelle informazioni al dataframe con i film più visti per anno ci basta eseguire un join lunga la colonna movieID, presente in entrambi i Dataframe." ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 514 }, "colab_type": "code", "executionInfo": { "elapsed": 230464, "status": "ok", "timestamp": 1592472248081, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "N6W1Y-f5st8f", "outputId": "cda453e0-52c5-4fa5-9477-ff1d16348520" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-------------+---------------------------------------------------------+-----------------------------------------------+\n", "|movieID|year|total_viewers|title |genres |\n", "+-------+----+-------------+---------------------------------------------------------+-----------------------------------------------+\n", "|5952 |2003|368400 |Lord of the Rings: The Two Towers, The (2002) |Adventure|Fantasy |\n", "|2571 |2007|340900 |Matrix, The (1999) |Action|Sci-Fi|Thriller |\n", "|318 |2018|431100 |Shawshank Redemption, The (1994) |Crime|Drama |\n", "|2571 |2015|1277600 |Matrix, The (1999) |Action|Sci-Fi|Thriller |\n", "|7153 |2006|400100 |Lord of the Rings: The Return of the King, The (2003) |Action|Adventure|Drama|Fantasy |\n", "|318 |2013|271400 |Shawshank Redemption, The (1994) |Crime|Drama |\n", "|780 |1997|1135000 |Independence Day (a.k.a. ID4) (1996) |Action|Adventure|Sci-Fi|Thriller |\n", "|318 |2014|267200 |Shawshank Redemption, The (1994) |Crime|Drama |\n", "|7153 |2004|369700 |Lord of the Rings: The Return of the King, The (2003) |Action|Adventure|Drama|Fantasy |\n", "|592 |1996|2576000 |Batman (1989) |Action|Crime|Thriller |\n", "|1721 |1998|239900 |Titanic (1997) |Drama|Romance |\n", "|79132 |2012|242200 |Inception (2010) |Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX|\n", "|58559 |2009|372000 |Dark Knight, The (2008) |Action|Crime|Drama|IMAX |\n", "|318 |2016|897600 |Shawshank Redemption, The (1994) |Crime|Drama |\n", "|1210 |2001|451700 |Star Wars: Episode VI - Return of the Jedi (1983) |Action|Adventure|Sci-Fi |\n", "|5952 |2005|622800 |Lord of the Rings: The Two Towers, The (2002) |Adventure|Fantasy |\n", "|1210 |2000|745200 |Star Wars: Episode VI - Return of the Jedi (1983) |Action|Adventure|Sci-Fi |\n", "|72998 |2010|388200 |Avatar (2009) |Action|Adventure|Sci-Fi|IMAX |\n", "|79132 |2011|328000 |Inception (2010) |Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX|\n", "|2571 |2008|420700 |Matrix, The (1999) |Action|Sci-Fi|Thriller |\n", "|318 |2017|787300 |Shawshank Redemption, The (1994) |Crime|Drama |\n", "|2396 |1999|462100 |Shakespeare in Love (1998) |Comedy|Drama|Romance |\n", "|4993 |2002|355100 |Lord of the Rings: The Fellowship of the Ring, The (2001)|Adventure|Fantasy |\n", "+-------+----+-------------+---------------------------------------------------------+-----------------------------------------------+\n", "\n" ] } ], "source": [ "dfMostViewedYear = dfMostViewedYear.join(df_desc, [\"movieId\"])\n", "\n", "# impostando il secondo parametro del metodo .show() a False\n", "# possiamo visualizzare tutto il Dataframe in ampiezza\n", "\n", "dfMostViewedYear.show(27, False)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 118421, "status": "ok", "timestamp": 1592472366516, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "xmHYy_UQst54" }, "outputs": [], "source": [ "# dfMostViewedYear.toPandas().to_csv(\"top_movie_by_year.csv\", header=True)\n", "dfMostViewedYear.write.csv(\"top_movie_by_year\")" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "GHAoKPS6u4SS" }, "source": [ "Data la natura parallela di spark, le righe del Dataframe non verranno salvate all'interno di un unico file, ma verrà creato un file per ogni riga. Possiamo unire i file in un singolo file csv utilizzando il comando cat da terminale, infine utilizziamo il comando rm per rimuovere la cartella con i vari file creata da spark." ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 121183, "status": "ok", "timestamp": 1592472369284, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "Q5BNfuezuvoE" }, "outputs": [], "source": [ "!cat top_movie_by_year/part* > top_movie_by_year.csv\n", "!rm -r top_movie_by_year" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "MAcDh9Yku9bQ" }, "source": [ "## SQL\n", "Se sei appassionato di SQL, questa è la query che ci permette di trovare i film più valutati per anno." ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 599, "status": "ok", "timestamp": 1592472404381, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "J7V8m17Ku5_I" }, "outputs": [], "source": [ "dfWithYear.registerTempTable('movies')" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "colab": {}, "colab_type": "code", "executionInfo": { "elapsed": 907, "status": "ok", "timestamp": 1592472405226, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "LJ6157tyu6wU" }, "outputs": [], "source": [ "new_df = spark.sql(\"\"\"\n", "SELECT movieID, YEAR, VALUT FROM (\n", "SELECT T.*, RANK() OVER(PARTITION BY YEAR ORDER BY VALUT DESC) RNK FROM (\n", "SELECT movieID, YEAR, COUNT(*) VALUT FROM movies\n", "GROUP BY movieID, YEAR) T)\n", "WHERE RNK = 1\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 460 }, "colab_type": "code", "executionInfo": { "elapsed": 84050, "status": "ok", "timestamp": 1592472489580, "user": { "displayName": "T3Lab Vision", "photoUrl": "", "userId": "14779383426442114373" }, "user_tz": -120 }, "id": "sTMN9rcwu6rz", "outputId": "fba27cc9-471d-4ea8-bcda-53412c023518" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-----+\n", "|movieID|YEAR|VALUT|\n", "+-------+----+-----+\n", "| 5952|2003| 3684|\n", "| 2571|2007| 3409|\n", "| 318|2018| 4311|\n", "| 2571|2015|12776|\n", "| 7153|2006| 4001|\n", "| 318|2013| 2714|\n", "| 780|1997|11350|\n", "| 318|2014| 2672|\n", "| 7153|2004| 3697|\n", "| 592|1996|25760|\n", "| 1721|1998| 2399|\n", "| 79132|2012| 2422|\n", "| 58559|2009| 3720|\n", "| 318|2016| 8976|\n", "| 1079|1995| 1|\n", "| 21|1995| 1|\n", "| 47|1995| 1|\n", "| 1176|1995| 1|\n", "| 1210|2001| 4517|\n", "| 5952|2005| 6228|\n", "+-------+----+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "new_df.show(20)" ] } ], "metadata": { "colab": { "name": "movies_rev.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 1 }