Regression: How to Choose the Model¶
This notebook provides an example of how to compare the behaviour of several regressors and choose the best one. The choice relies on cross-validation, which measures how robust each regressor is to variations in the train/test split; a minimal sketch of the cross-validation call follows the outline below.
1. Load the (regression) dataset
2. Compare the models
3. Choose the best model
4. Apply the trained model to a new set of data
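Before the full pipeline, here is a minimal, self-contained sketch of the cross-validation call used throughout; the diabetes dataset and the Ridge regressor are arbitrary illustrative choices, not part of the original notebook.

# A sketch: score one model on k different train/test splits instead of a single one.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score

X_demo, Y_demo = load_diabetes(return_X_y=True)
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(Ridge(), X_demo, Y_demo, cv=k_fold, scoring="neg_mean_squared_error")
print(scores.mean(), scores.std())  # mean and spread of the negated MSE across the 5 folds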
Import the libraries¶
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error # MSE
from sklearn.metrics import mean_absolute_error # MAE
from sklearn.metrics import median_absolute_error # MedAE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RANSACRegressor, SGDRegressor, HuberRegressor, TheilSenRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import seaborn as sn
import matplotlib.pyplot as plt
import random
import numpy as np
import plotly.graph_objects as go
import pickle
import json
from sklearn.pipeline import Pipeline
from sklearn import datasets
1. Load the dataset¶
We assume that the last column of the dataset is the target value and that the dataset contains only numeric input columns.
def load_dataset(name="boston"):
    datasets_list = ["boston", "diabetes"]  # ,"tips"]
    chosen_dataset = name
    # NOTE: load_boston() was deprecated in scikit-learn 1.0 and removed in 1.2 (see below)
    boston = datasets.load_boston()
    df_boston = pd.DataFrame(boston.data, columns=boston.feature_names)
    df_boston["price"] = boston.target
    diabetes = datasets.load_diabetes()
    df_diabetes = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
    df_diabetes["disease"] = diabetes.target
    df_diabetes = df_diabetes.round(decimals=5)
    #df_nuovo_dataset = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vSqHhx2kS9gCNmI04yksqTP2PRsT6ifTU2DLokKs3Y6KgcSGIAL7_4t_q_8kNhVkFA0xD2nt7hn_w-5/pub?output=csv")
    dict_datasets = {
        "boston": df_boston,
        "diabetes": df_diabetes,
        # "nuovo": df_nuovo_dataset
    }
    df = dict_datasets[chosen_dataset]
    df = df.dropna()
    #print(df.head())
    X = df.iloc[:, 0:-1].values  # Input: from the first to the second-to-last column
    Y = df.iloc[:, -1].values    # Target: last column
    return X, Y, df
X,Y,df = load_dataset(name="boston")
display(df.head())
|   | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | price |
|---|------|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|-------|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
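Note: load_boston() was deprecated in scikit-learn 1.0 and removed in 1.2, so load_dataset() only runs on older versions. A minimal sketch of an alternative, assuming a recent scikit-learn and network access, fetches the same data from OpenML:

# Sketch, assuming scikit-learn >= 1.2: fetch the Boston data from OpenML instead.
from sklearn.datasets import fetch_openml

boston = fetch_openml(name="boston", version=1, as_frame=True)
df_boston = boston.data.astype(float)  # CHAS and RAD arrive as categoricals
df_boston["price"] = boston.target.astype(float)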
2. Compare the models¶
def validate(Y_test, Y_pred, name):
    # Report the standard regression error metrics for a set of predictions
    mse = mean_squared_error(Y_test, Y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(Y_test, Y_pred)
    medae = median_absolute_error(Y_test, Y_pred)
    print("[" + name + "]" + " MSE: ", round(mse, 4), "RMSE: ", round(rmse, 4), "MAE: ", round(mae, 4), "MedAE: ", round(medae, 4))
def compare_models(X, Y):
    # Split data into training and validation set
    #X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01, shuffle=True, random_state=0)
    #print("Shapes: X_train: ", X_train.shape, "Y_train: ", Y_train.shape, "X_test: ", X_test.shape, "Y_test", Y_test.shape)
    #print("Metric: negative mean squared error (MSE)")
    # Scaling
    sc = StandardScaler()
    sc.fit(X)
    X_train = sc.transform(X)
    Y_train = Y
    #X_test = sc.transform(X_test)
    # PCA (disabled)
    '''
    pc = PCA(n_components=0.95)
    pc.fit(X_train)
    X_train = pc.transform(X_train)
    X_test = pc.transform(X_test)
    print(pc.explained_variance_)
    print(pc.explained_variance_ratio_)
    '''
    # Polynomial features (disabled)
    '''
    poly = PolynomialFeatures(degree=2)
    poly.fit(X_train)
    X_train = poly.transform(X_train)
    X_test = poly.transform(X_test)
    '''
    # user variables to tune
    seed = 5
    folds = 20
    # scikit-learn maximizes scores, so the MSE is negated: higher (closer to 0) is better
    metric = "neg_mean_squared_error"
    # hold the different regression models in a single dictionary
    models = {}
    models["Linear"] = LinearRegression()
    #models["RANSAC"] = RANSACRegressor()
    models["Huber"] = HuberRegressor(max_iter=1000)
    models["TheilSen"] = TheilSenRegressor()
    #models["SGD"] = SGDRegressor(max_iter=500, penalty=None, eta0=0.01, tol=0.00001)
    models["Ridge"] = Ridge()
    models["Lasso"] = Lasso()
    models["ElasticNet"] = ElasticNet()
    models["KNN"] = KNeighborsRegressor(n_neighbors=5)
    models["DecisionTree"] = DecisionTreeRegressor()
    models["SVR"] = SVR(gamma="auto")
    models["AdaBoost"] = AdaBoostRegressor(n_estimators=50)
    models["GradientBoost"] = GradientBoostingRegressor(n_estimators=100)
    models["RandomForest"] = RandomForestRegressor(n_estimators=100)
    models["ExtraTrees"] = ExtraTreesRegressor(n_estimators=100)
    # k-fold cross-validation (folds=20) for each model
    model_results = []
    model_names = []
    for model_name in models:
        model = models[model_name]
        k_fold = KFold(n_splits=folds, random_state=seed, shuffle=True)
        results = cross_val_score(model, X_train, Y_train, cv=k_fold, scoring=metric)
        model_results.append(results)
        model_names.append(model_name)
        # mean and standard deviation of the negated MSE across the folds
        print("{}: {}, {}".format(model_name, round(results.mean(), 3), round(results.std(), 3)))
    fig = go.Figure()
    for name, res in zip(model_names, model_results):
        fig.add_trace(go.Box(y=res, name=name, boxpoints='all'))
    #fig.show()
    return fig
fig_compare_models = compare_models(X,Y)
fig_compare_models.show()
Linear: -23.533, 10.945
Huber: -25.031, 14.541
TheilSen: -37.511, 28.329
Ridge: -23.524, 10.984
Lasso: -29.188, 14.094
ElasticNet: -30.867, 16.12
KNN: -20.13, 15.229
DecisionTree: -23.841, 20.706
SVR: -27.564, 19.139
AdaBoost: -14.025, 6.407
GradientBoost: -8.321, 4.745
RandomForest: -10.23, 6.179
ExtraTrees: -9.024, 5.04
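Reading the winner off the printout works for a handful of models, but a programmatic ranking scales better. This is a hypothetical extension, assuming compare_models were modified to also return model_names and model_results (it currently returns only the figure):

# Hypothetical: rank models by mean CV score (higher, i.e. closer to 0, is better).
summary = pd.DataFrame({
    "model": model_names,
    "mean_neg_mse": [res.mean() for res in model_results],
    "std_neg_mse": [res.std() for res in model_results],
}).sort_values("mean_neg_mse", ascending=False)
print(summary.head())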
3. Choose the best model¶
In the run above, GradientBoost has the highest mean negative MSE (closest to zero) and the smallest standard deviation, so it is the model we train and save below.
def plot_fig(Ys, names):
    # Ys: list of outputs to plot, e.g. [Y_real, Y_pred]
    n = np.linspace(0, len(Ys[0]), len(Ys[0]), dtype=int)
    fig = go.Figure()
    for yh, nm in zip(Ys, names):
        fig.add_trace(go.Scatter(x=n, y=yh,
                                 mode='lines',  # mode='lines+markers',
                                 name=nm))
    fig.update_layout(
        hovermode="x",
        paper_bgcolor="rgb(0,0,0)",
        plot_bgcolor="rgb(10,10,10)",
        title=dict(
            x=0.5,
            text="Results",
            font=dict(
                size=20,
                color="rgb(255,255,255)"
            )
        )
    )
    return fig
def train(X, Y, selected="Linear", modelName='best_model.sav'):
    # Repeat the random train/test split n_iter times and keep the model with the best test R2
    max_val = -np.inf
    n_iter = 20
    R2_train_max = 0
    R2_test_max = 0
    for i in range(0, n_iter):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.10, shuffle=True)
        # Scaling (handled by the pipeline below)
        #sc = StandardScaler()
        #sc.fit(X_train)
        #X_train = sc.transform(X_train)
        #X_test = sc.transform(X_test)
        # create and fit the chosen regression model
        models = {}
        models["Linear"] = LinearRegression()
        #models["RANSAC"] = RANSACRegressor()
        models["Huber"] = HuberRegressor(max_iter=1000)
        models["TheilSen"] = TheilSenRegressor()
        #models["SGD"] = SGDRegressor(max_iter=500, penalty=None, eta0=0.01, tol=0.00001)
        models["Ridge"] = Ridge()
        models["Lasso"] = Lasso()
        models["ElasticNet"] = ElasticNet()
        models["KNN"] = KNeighborsRegressor()
        models["DecisionTree"] = DecisionTreeRegressor()
        models["SVR"] = SVR()
        models["AdaBoost"] = AdaBoostRegressor()
        models["GradientBoost"] = GradientBoostingRegressor()
        models["RandomForest"] = RandomForestRegressor()
        models["ExtraTrees"] = ExtraTreesRegressor()
        best_model = models[selected]
        # Preprocessing + regression pipeline
        pipeline = Pipeline([
            ("sc", StandardScaler()),
            #("pca", PCA(n_components=0.98)),
            ("reg", best_model),
        ])
        pipeline.fit(X_train, Y_train)
        #best_model.fit(X_train, Y_train)
        # make predictions using the model (train and test)
        Y_test_pred = pipeline.predict(X_test)
        Y_train_pred = pipeline.predict(X_train)
        #print("[INFO] MSE: {}".format(round(mean_squared_error(Y_test, Y_test_pred), 3)))
        # R2 score, the coefficient of determination (how well the inputs explain the target):
        # 0 is bad, 1 is good
        #validate(Y_train, Y_train_pred, name="Training")
        R2_train = pipeline.score(X_train, Y_train)
        #print("[Training] R2 Score: ", round(R2_train, 3))
        #validate(Y_test, Y_test_pred, name="Test")
        R2_test = pipeline.score(X_test, Y_test)
        #print("[Test] R2 Score: ", round(R2_test, 3))
        if R2_test > max_val:  # compare the raw score: abs() would reward large negative R2
            # Save the best model so far
            R2_train_max = R2_train
            R2_test_max = R2_test
            pickle.dump(pipeline, open(modelName, 'wb'))
            max_val = R2_test
            fig_train = plot_fig([Y_train, Y_train_pred], ["Train Real", "Train Predicted"])
            fig_test = plot_fig([Y_test, Y_test_pred], ["Test Real", "Test Predicted"])
    print("Best: [Training] R2 Score: ", round(R2_train_max, 3))
    print("Best: [Test] R2 Score: ", round(R2_test_max, 3))
    return fig_train, fig_test
model_list = ["Linear", "Huber", "TheilSen","Ridge","Lasso","ElasticNet","KNN","DecisionTree","SVR","AdaBoost","GradientBoost","RandomForest","ExtraTrees"]
chosen_model = "GradientBoost"
fig_train, fig_test = train(X,Y,selected=chosen_model,modelName="test.sav")
fig_train.show()
fig_test.show()
Best: [Training] R2 Score: 0.976
Best: [Test] R2 Score: 0.947
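The validate() helper from section 2 is only ever called from commented-out lines. Here is a minimal usage sketch on the saved pipeline; note that a fresh random split of X may overlap the data the pipeline was trained on, so treat the numbers as illustrative only.

# Sketch: score the saved pipeline with the validate() helper.
X_tr, X_te, Y_tr, Y_te = train_test_split(X, Y, test_size=0.10, shuffle=True, random_state=0)
saved_pipeline = pickle.load(open("test.sav", 'rb'))
validate(Y_te, saved_pipeline.predict(X_te), name="Saved model")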
4. Apply the trained model to a new set of data¶
def apply_model(X, modelName='best_model.sav'):
    # Load the saved pipeline and run it on new inputs
    loaded_model = pickle.load(open(modelName, 'rb'))
    y_hat = loaded_model.predict(X)
    return y_hat
For this part we will pretend to have new data, which we will create using scikit-learn's train_test_split function.
# Create some data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80, shuffle=True)
Let's load the pipeline component by component.
loaded_model = pickle.load(open("test.sav", 'rb'))
# Apply the standard scaling
sc_block = loaded_model.named_steps["sc"]
X_test_sc = sc_block.transform(X_test)
# Apply the PCA (disabled in this pipeline)
#pca_block = loaded_model.named_steps["pca"]
#X_test_pca = pca_block.transform(X_test_sc)
#print(pca_block.explained_variance_ratio_)
# Apply the regressor
model_trained = loaded_model.named_steps["reg"]
Y_pred = model_trained.predict(X_test_sc)
Let's load the entire pipeline in one go.
Y_pred_pipe = apply_model(X_test,modelName="test.sav")
Let's make sure the two results are identical (a programmatic check follows the plot).
fig_res = plot_fig([Y_test, Y_pred,Y_pred_pipe], ["test", "pred","pipe"])
fig_res.show()
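Besides the visual comparison, a one-line numeric check with numpy (already imported) confirms the equality:

# True when the step-by-step predictions match the full-pipeline predictions
print(np.allclose(Y_pred, Y_pred_pipe))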
Extra: Train-Test procedure¶
# Training procedure
def train_procedure():
    # 0 LOAD THE DATASET
    print("LOAD DATASET")
    X, Y, df = load_dataset(name="boston")
    # 1 COMPARE THE MODELS
    print("COMPARE MODELS")
    fig_compare_models = compare_models(X, Y)
    # 2 CHOOSE THE BEST MODEL
    print("CHOOSE THE BEST MODEL")
    model_list = ["Linear", "Huber", "TheilSen", "Ridge", "Lasso", "ElasticNet", "KNN", "DecisionTree", "SVR", "AdaBoost", "GradientBoost", "RandomForest", "ExtraTrees"]
    chosen_model = "GradientBoost"
    model_name = "test.sav"
    fig_train, fig_test = train(X, Y, selected=chosen_model, modelName=model_name)
# Test procedure
def test_procedure():
    # 3 LOAD THE NEW DATASET
    print("LOAD NEW DATASET")
    X, Y, df = load_dataset(name="boston")
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80, shuffle=True)
    # 3b APPLY THE TRAINED MODEL
    print("APPLY TRAINED MODEL")
    Y_pred_pipe = apply_model(X_test, modelName="test.sav")
    fig_res = plot_fig([Y_test, Y_pred_pipe], ["test", "pipe"])
#train_procedure()
test_procedure()