
Regression: How to Choose the Model

This notebook walks through comparing the behavior of several regressors and choosing the best one. The choice is driven by cross-validation, which shows how robust each regressor is to variations in the train/test split.

  1. Load the (regression) dataset.

  2. Compare the models.

  3. Choose the best model.

  4. Apply the trained model to a new dataset.

Import the libraries

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error  # MSE
from sklearn.metrics import mean_absolute_error # MAE
from sklearn.metrics import median_absolute_error # MedAE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RANSACRegressor, SGDRegressor, HuberRegressor, TheilSenRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import seaborn as sn
import matplotlib.pyplot as plt
import random
import numpy as np
import plotly.graph_objects as go
import pickle
import json
from sklearn.pipeline import Pipeline
from sklearn import datasets

1. Load the dataset

We assume that the last column of the dataset is the target value and that the dataset contains only numeric input values.
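
If you want to plug in your own data, here is a minimal sketch (the file my_data.csv and its columns are hypothetical) that yields the same X, Y, df layout expected below:

# Hypothetical CSV whose last column is the numeric target
df_custom = pd.read_csv("my_data.csv").dropna()
X_custom = df_custom.iloc[:, 0:-1].values  # inputs: all but the last column
Y_custom = df_custom.iloc[:, -1].values    # target: the last column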

def load_dataset(name="boston"):
    datasets_list = ["boston", "diabetes"] #,"tips"]
    chosen_dataset = name #"boston"

    # Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 (see below)
    boston = datasets.load_boston()
    df_boston = pd.DataFrame(boston.data,columns=boston.feature_names)
    df_boston["price"] = boston.target

    diabetes = datasets.load_diabetes()
    df_diabetes = pd.DataFrame(diabetes.data,columns=diabetes.feature_names)
    df_diabetes["disease"] = diabetes.target
    df_diabetes = df_diabetes.round(decimals=5)

    #df_nuovo_dataset = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSqHhx2kS9gCNmI04yksqTP2PRsT6ifTU2DLokKs3Y6KgcSGIAL7_4t_q_8kNhVkFA0xD2nt7hn_w-5/pub?output=csv")

    dict_datasets = {
        "boston": df_boston,
        "diabetes": df_diabetes,
    #    "nuovo" : df_nuovo_dataset
    }
    df = dict_datasets[chosen_dataset]#.head(10)
    df = df.dropna()

    #print(df.head())
    X = df.iloc[:,0:-1].values  # Inputs: from the first to the second-to-last column
    Y = df.iloc[:,-1].values    # Target: the last column

    return X,Y,df
X,Y,df = load_dataset(name="boston")
display(df.head())
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT price
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
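
Note for newer environments: since load_boston was removed in scikit-learn 1.2, the cell above fails on recent versions. A sketch of an equivalent load via OpenML (assumes network access; column dtypes may differ slightly):

from sklearn.datasets import fetch_openml

# Download the same Boston housing data from openml.org
boston = fetch_openml(name="boston", version=1, as_frame=True)
df_boston = boston.data.copy()
df_boston["price"] = boston.target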

2. Compare the models

def validate(Y_test,Y_pred,name):
    mse = mean_squared_error(Y_test,Y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(Y_test,Y_pred)
    medae = median_absolute_error(Y_test,Y_pred)
    print("[" + name + "]" + " MSE: ", round(mse,4), "RMSE  : ", round(rmse,4), "MAE: ", round(mae,4), "MedAE: ", round(medae,4))

def compare_models(X,Y):
    # Split data into training and validation set
    #X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01,  shuffle=True, random_state=0) 
    #print("Shapes: X_train: ", X_train.shape, "Y_train: ", Y_train.shape, "X_test: ", X_test.shape, "Y_test", Y_test.shape)
    #print("Metric : negative mean square error (MSE)")

    # Scaling
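    # Note: fitting the scaler on the whole X before cross-validation leaks
    # information across folds; wrapping scaler + model in a Pipeline inside
    # cross_val_score would avoid this.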
    sc = StandardScaler()
    sc.fit(X)
    X_train = sc.transform(X)
    Y_train = Y
    #X_test = sc.transform(X_test)

    # PCA
    '''
    pc = PCA(n_components=0.95)
    pc.fit(X_train)
    X_train = pc.transform(X_train)
    X_test = pc.transform(X_test)
    print (pc.explained_variance_)
    print (pc.explained_variance_ratio_)
    '''
    # Polynomial features
    '''
    poly = PolynomialFeatures(degree=2)
    poly.fit(X_train)
    X_train = poly.transform(X_train)
    X_test = poly.transform(X_test)
    '''

    # user variables to tune
    seed    = 5
    folds   = 20
    metric  = "neg_mean_squared_error"

    # hold different regression models in a single dictionary
    models = {}
    models["Linear"]        = LinearRegression()
    #models["RANSAC"]        = RANSACRegressor()
    models["Huber"]         = HuberRegressor(max_iter=1000)
    models["TheilSen"]      = TheilSenRegressor()
    #models["SGD"]           = SGDRegressor(max_iter=500,penalty=None, eta0=0.01, tol=0.00001)
    models["Ridge"]         = Ridge()
    models["Lasso"]         = Lasso()
    models["ElasticNet"]    = ElasticNet()
    models["KNN"]           = KNeighborsRegressor(n_neighbors=5)
    models["DecisionTree"]  = DecisionTreeRegressor()
    models["SVR"]           = SVR(gamma="auto")
    models["AdaBoost"]      = AdaBoostRegressor(n_estimators=50)
    models["GradientBoost"] = GradientBoostingRegressor(n_estimators=100)
    models["RandomForest"]  = RandomForestRegressor(n_estimators=100)
    models["ExtraTrees"]    = ExtraTreesRegressor(n_estimators=100)

    # k-fold cross-validation for each model (here, folds = 20)
    model_results = []
    model_names   = []
    for model_name in models:
        model   = models[model_name]
        k_fold  = KFold(n_splits=folds, random_state=seed,shuffle=True)
        results = cross_val_score(model, X_train, Y_train, cv=k_fold, scoring=metric)

        model_results.append(results)
        model_names.append(model_name)
        print("{}: {}, {}".format(model_name, round(results.mean(), 3), round(results.std(), 3)))

    fig = go.Figure()
    for name,res in zip(model_names,model_results):    
        fig.add_trace(go.Box(y=res,name=name, boxpoints='all'))
    #fig.show()
    return fig
fig_compare_models = compare_models(X,Y)
fig_compare_models.show()
Linear: -23.533, 10.945
Huber: -25.031, 14.541
TheilSen: -37.511, 28.329
Ridge: -23.524, 10.984
Lasso: -29.188, 14.094
ElasticNet: -30.867, 16.12
KNN: -20.13, 15.229
DecisionTree: -23.841, 20.706
SVR: -27.564, 19.139
AdaBoost: -14.025, 6.407
GradientBoost: -8.321, 4.745
RandomForest: -10.23, 6.179
ExtraTrees: -9.024, 5.04
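
The scoring is neg_mean_squared_error, so scores are negative and values closer to zero are better: GradientBoost has the best mean here. To read the error on the original target scale, take the square root of the positive MSE, e.g.:

# Convert GradientBoost's mean negative MSE back to RMSE (target units)
print(np.sqrt(8.321))  # ≈ 2.885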

3. Choose the best model

def plot_fig(Ys, names):
    # Ys: list of series to plot, e.g. [Y_real, Y_pred]
    n = np.arange(len(Ys[0]))
    fig = go.Figure()
    for yh,nm in zip(Ys,names):
        fig.add_trace(go.Scatter(x=n, y=yh,
                      mode='lines',#mode='lines+markers',
                      name=nm))
    fig.update_layout(
      hovermode = "x",
      paper_bgcolor = "rgb(0,0,0)" ,
      plot_bgcolor = "rgb(10,10,10)" , 
      title=dict(
          x = 0.5,
          text = "Risultati",
          font=dict(
              size = 20,
              color = "rgb(255,255,255)"
          )
      )
    )
    return fig

def train(X,Y,selected="Linear", modelName='best_model.sav'):
    max_val = -np.inf   # best test R2 seen so far
    n_iter = 20
    R2_train_max = 0
    R2_test_max = 0
    for i in range(0,n_iter):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10,  shuffle=True) 

        # Scaling
        #sc = StandardScaler()
        #sc.fit(X_train)
        #X_train = sc.transform(X_train)
        #X_test = sc.transform(X_test)

        # create and fit the chosen regression model
        models = {}
        models["Linear"]        = LinearRegression()
        #models["RANSAC"]        = RANSACRegressor()
        models["Huber"]         = HuberRegressor(max_iter=1000)
        models["TheilSen"]      = TheilSenRegressor()
        #models["SGD"]           = SGDRegressor(max_iter=500,penalty=None, eta0=0.01, tol=0.00001)
        models["Ridge"]         = Ridge()
        models["Lasso"]         = Lasso()
        models["ElasticNet"]    = ElasticNet()
        models["KNN"]           = KNeighborsRegressor()
        models["DecisionTree"]  = DecisionTreeRegressor()
        models["SVR"]           = SVR()
        models["AdaBoost"]      = AdaBoostRegressor()
        models["GradientBoost"] = GradientBoostingRegressor()
        models["RandomForest"]  = RandomForestRegressor()
        models["ExtraTrees"]    = ExtraTreesRegressor()
        
        best_model = models[selected]

        # Build the regression pipeline (scaler + model)
        pipeline = Pipeline([
            ("sc", StandardScaler()),
            #("pca", PCA(n_components=0.98)),
            ("reg", best_model),
        ])
        pipeline.fit(X_train, Y_train)
        
        #best_model.fit(X_train, Y_train)

        # make predictions using the model (train and test)
        Y_test_pred = pipeline.predict(X_test)
        Y_train_pred = pipeline.predict(X_train)
        #print("[INFO] MSE : {}".format(round(mean_squared_error(Y_test, Y_test_pred), 3)))

        # R2 score: coefficient of determination (how well the inputs explain the target)
        # 0 is bad, 1 is good
        #validate(Y_train,Y_train_pred,name="Training")
        R2_train = pipeline.score(X_train, Y_train)
        #print("[Training] R2 Score: ", round(R2_train,3))

        #validate(Y_test,Y_test_pred,name="Test")
        R2_test = pipeline.score(X_test, Y_test)
        #print("[Test] R2 Score: ", round(R2_test,3))

        if R2_test > max_val:
            # Save the model with the best test R2 so far
            R2_train_max = R2_train
            R2_test_max = R2_test
            pickle.dump(pipeline, open(modelName, 'wb'))
            max_val = R2_test
            fig_train = plot_fig([Y_train,Y_train_pred],["Train Real", "Train Predicted"])
            fig_test = plot_fig([Y_test,Y_test_pred],["Test Real","Test Predicted"])
    
    print( "Best: [Training] R2 Score: ", round(R2_train_max,3))
    print("Best: [Test] R2 Score: ", round(R2_test_max,3))
    return fig_train,fig_test
model_list = ["Linear", "Huber", "TheilSen","Ridge","Lasso","ElasticNet","KNN","DecisionTree","SVR","AdaBoost","GradientBoost","RandomForest","ExtraTrees"]
chosen_model = "GradientBoost"
fig_train, fig_test = train(X,Y,selected=chosen_model,modelName="test.sav")
fig_train.show()
fig_test.show()
Best: [Training] R2 Score:  0.976
Best: [Test] R2 Score:  0.947
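
A side note on persistence: pickle works, and scikit-learn's documentation also suggests joblib for estimators holding large NumPy arrays. A minimal sketch of the equivalent save/load (trained_pipeline is a placeholder for any fitted Pipeline, e.g. the one saved by train()):

import joblib

# trained_pipeline: placeholder for a fitted Pipeline object
joblib.dump(trained_pipeline, "best_model.joblib")
trained_pipeline = joblib.load("best_model.joblib")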

4. Apply the trained model to a new dataset

def apply_model(X,modelName='best_model.sav'):
    with open(modelName, 'rb') as f:
        loaded_model = pickle.load(f)
    y_hat = loaded_model.predict(X)
    return y_hat

For this part we will pretend to have new data, which we create using scikit-learn's train_test_split function.

# Create some data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80,  shuffle=True) 

Load the pipeline component by component

loaded_model = pickle.load(open("test.sav", 'rb'))

# Apply the standard scaling
sc_block = loaded_model.named_steps["sc"]
X_test_sc = sc_block.transform(X_test)

# Apply the PCA (commented out, since the saved pipeline does not include it)
#pca_block = loaded_model.named_steps["pca"]
#X_test_pca = pca_block.transform(X_test_sc)
#print(pca_block.explained_variance_ratio_)

# Apply the regressor
model_trained = loaded_model.named_steps["reg"]
Y_pred = model_trained.predict(X_test_sc)

Load the entire pipeline

Y_pred_pipe = apply_model(X_test,modelName="test.sav")

Let's make sure the two results are identical

fig_res = plot_fig([Y_test, Y_pred,Y_pred_pipe], ["test", "pred","pipe"])
fig_res.show()
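
Besides the visual comparison, a one-line numeric check (both arrays come from the cells above):

print(np.allclose(Y_pred, Y_pred_pipe))  # True when stepwise and full-pipeline predictions match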

Extra: Train-Test procedure

# Training procedure
def train_procedure():
    # 0 LOAD THE DATASET
    print("LOAD DATASET")
    X,Y,df = load_dataset(name="boston")

    # 1 COMPARE THE MODELS
    print("COMPARE MODELS")
    fig_compare_models = compare_models(X,Y)

    # 2 CHOOSE THE BEST MODEL
    print("CHOOSE BEST MODEL")
    model_list = ["Linear", "Huber", "TheilSen","Ridge","Lasso","ElasticNet","KNN","DecisionTree","SVR","AdaBoost","GradientBoost","RandomForest","ExtraTrees"]
    chosen_model = "GradientBoost"
    model_name = "test.sav"
    fig_train, fig_test = train(X,Y,selected=chosen_model,modelName=model_name)

# Test procedure
def test_procedure():
    # 3 LOAD THE NEW DATASET
    print("LOAD NEW DATASET")
    X,Y,df = load_dataset(name="boston")
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80,  shuffle=True)

    # 3bis. APPLY THE TRAINED MODEL
    print("APPLY TRAINED MODEL")
    Y_pred_pipe = apply_model(X_test,modelName="test.sav")
    fig_res = plot_fig([Y_test,Y_pred_pipe], ["test","pipe"])

# Note: test_procedure() assumes "test.sav" was created by a previous run of train_procedure()
#train_procedure()
test_procedure()