Open In Colab

Regressione Custom Data

# Upload the dataset file interactively (Colab-only widget).
from google.colab import files
uploaded = files.upload()
# IPython shell magic: list the working directory to confirm the upload.
!ls
dati.xlsx  drive  sample_data
# Analysis stack: pandas for tabular data, seaborn/matplotlib for plots.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the uploaded Excel workbook into a DataFrame.
data = pd.read_excel('dati.xlsx')
# Preview the first five rows (notebook display).
data.head()
var1 var2 var6 var7 var8 var9 var10 var11 var12 var13 var14 var15 var16 var17 var18 var19 var20 var21 var22 var23 var27 var28 var29 var30 var31 var32 var33 var34 var35 var39 var40 var41 var42 var43 var44 var45 var46 var47 var48 var49 ... var108 var109 var110 var111 var112 var113 var114 var115 var116 var123 var124 var125 var126 var127 var128 var129 var130 var131 var132 var133 var134 var135 var136 var137 var141 var142 var143 var147 var148 var149 var150 var151 var152 var153 var154 var155 var156 var157 var158 costi
0 147823139 148193655 51344312 61401789 71387976 23064675 20882011 14244395 2794279 2677799 2833077 14562964 14352577 8073273 0 0 0 0 0 73273 9312470 7915379 7272270 35436524 37191289 32749686 32719851 34723462 30111298 84978 318158 318158 0 0 0 7834908 2178851 18788146 290899 188430 ... 13310780 13264428 12114005 14296757 13558421 13421133 -823170 -507756 -530593 1228034 977093 1122097 783131 572136 267603 1588604 1604455 1856974 36310 -66672 135970 0 0 0 1870101 3257494 4594656 -3027241 1534078 3813249 229 214 207 84978 318158 318158 25290616 24108358 24535319 144205134
1 147562322 163251010 1037776 1391301 1602930 23503659 22973259 20946238 13368527 12404318 11085331 1850794 1693511 1060485 364872 1287356 811956 1485922 406155 248529 34529004 35683884 35340902 23391321 22915530 18886264 10795833 11017483 11177585 1583 1583 1588 355977 43180 471450 5293624 2889766 1616895 408806 551290 ... 16889643 15833127 15986500 2713441 2434423 2180861 3207072 1304536 -3015558 912728 1042592 1058524 80006 47390 35171 769967 895916 1201286 11943 82820 1553908 0 0 0 1373420 1220932 1304576 1924469 2124721 2204557 454 411 456 1583 1583 1588 14133394 10404117 12898505 140165127
2 147562000 142652000 737000 707000 745000 51250000 51604000 50992000 23900000 25404000 26269000 69000 66000 56000 0 0 0 0 0 0 17498000 12270000 12954000 31359000 47731000 43044000 26022000 43298000 39653000 1801000 1761000 1171000 3000 3000 0 19082000 4224000 5958000 1596000 7000 ... 9901000 10029000 9581000 3832000 3762000 3497000 -6933000 683000 2125000 2624000 2743000 1937000 53000 59000 59000 915000 1122000 1153000 -1000 3000 -10000 0 0 -36000 9000 1166000 1450000 -803000 3170000 2408000 116 165 168 1801000 1761000 1171000 31861000 22494000 24687000 145190000
3 147067625 140160124 39316 5715 8015 5258584 5491288 5735791 4183889 4354175 4510666 25917 25917 25917 25917 25917 25917 0 0 0 25923161 17921979 19627665 21948953 22868722 23881755 15584991 15576621 23831452 890646 0 0 0 2226616 0 185206 54173 1151195 643650 567191 ... 3075383 3296773 3290801 631775 638706 725484 -1868447 -306450 3770484 421274 885471 377168 217482 149383 155720 428722 439358 525316 34150 95776 -114667 0 0 0 80381 185005 258220 478455 494325 473999 60 60 65 890646 0 0 1000000 0 1188439 112760368
4 145273739 127413305 6571906 3613836 3218188 65670888 56917531 51856050 20251827 20965972 20661743 2324116 2070401 179562 1754388 1625505 178366 0 1196 1196 18950865 16878270 18797216 8362491 16303624 14624193 4425910 11432236 11013435 0 0 0 0 0 0 14381920 10465984 10529721 1463431 707312 ... 6335628 5294076 4974163 5310610 4361914 3345706 -2235901 3075212 -2049351 1676826 1524379 2270501 31436 12726 60454 781590 787832 1051073 -481 -95 -697 0 0 0 472994 795056 943092 966348 1243727 869544 171 141 132 0 0 0 27325115 24037285 24492237 171980718

5 rows × 135 columns

# Column dtypes and non-null counts (all 135 columns are int64 per the output).
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7479 entries, 0 to 7478
Columns: 135 entries, var1 to costi
dtypes: int64(135)
memory usage: 7.7 MB
# Descriptive statistics for every numeric column.
data.describe()
var1 var2 var6 var7 var8 var9 var10 var11 var12 var13 var14 var15 var16 var17 var18 var19 var20 var21 var22 var23 var27 var28 var29 var30 var31 var32 var33 var34 var35 var39 var40 var41 var42 var43 var44 var45 var46 var47 var48 var49 ... var108 var109 var110 var111 var112 var113 var114 var115 var116 var123 var124 var125 var126 var127 var128 var129 var130 var131 var132 var133 var134 var135 var136 var137 var141 var142 var143 var147 var148 var149 var150 var151 var152 var153 var154 var155 var156 var157 var158 costi
count 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 ... 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7479.000000 7479.000000 7479.000000 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03 7.479000e+03
mean 7.450131e+06 6.995936e+06 3.377709e+05 2.954539e+05 2.374321e+05 2.080609e+06 1.977554e+06 1.866474e+06 9.678114e+05 9.397979e+05 8.494891e+05 2.730416e+05 2.483007e+05 2.281703e+05 1.867172e+05 1.722472e+05 1.578191e+05 2.484367e+04 2.069652e+04 1.858893e+04 1.325843e+06 1.256179e+06 1.160876e+06 2.188183e+06 2.161061e+06 2.019932e+06 1.505207e+06 1.506826e+06 1.418052e+06 6.012760e+04 5.466102e+04 5.602265e+04 9.828008e+04 8.966333e+04 6.497756e+04 5.901713e+05 5.677180e+05 5.011291e+05 5.604334e+04 5.130161e+04 ... 8.240549e+05 7.776656e+05 7.208212e+05 2.562921e+05 2.444049e+05 2.225539e+05 -2.048778e+04 -2.213982e+04 -2.373427e+04 9.113039e+04 8.943493e+04 8.721292e+04 1.517198e+04 1.664002e+04 1.264100e+04 5.208721e+04 4.878168e+04 5.321769e+04 -2.687684e+02 -2.391546e+03 3.375687e+02 -6.536953e+03 -4.234294e+03 -4.462733e+03 8.603223e+04 8.922157e+04 9.986623e+04 1.745804e+05 1.562779e+05 1.578679e+05 21.127958 20.665062 19.299773 6.772403e+04 6.175881e+04 6.265347e+04 1.076427e+06 9.915747e+05 9.137636e+05 7.633431e+06
std 1.646081e+07 1.588951e+07 6.690769e+06 5.441119e+06 2.622168e+06 5.869370e+06 5.642296e+06 5.481200e+06 3.769399e+06 3.708256e+06 3.013458e+06 1.941856e+06 1.788020e+06 1.698509e+06 1.564896e+06 1.488300e+06 1.409354e+06 4.182777e+05 3.335736e+05 2.810225e+05 4.068137e+06 3.848374e+06 3.627495e+06 4.826904e+06 4.757523e+06 4.640312e+06 3.604305e+06 3.616744e+06 3.454110e+06 4.522008e+05 3.798973e+05 3.431355e+05 1.457817e+06 1.281663e+06 9.854663e+05 2.142858e+06 2.123655e+06 1.775132e+06 2.349897e+05 2.311300e+05 ... 2.012542e+06 1.902859e+06 1.754317e+06 9.203786e+05 8.755274e+05 6.934450e+05 5.676930e+05 4.581167e+05 4.757960e+05 2.798760e+05 3.139860e+05 3.194439e+05 2.030893e+05 3.315507e+05 1.329318e+05 2.667476e+05 1.764451e+05 2.018601e+05 3.163845e+04 6.514876e+04 3.858767e+04 3.112814e+05 8.613143e+04 1.057758e+05 3.406345e+05 3.400689e+05 3.614035e+05 1.173476e+06 1.160223e+06 9.680606e+05 46.321574 46.255558 44.001820 4.963626e+05 4.301135e+05 3.964864e+05 4.537630e+06 3.720963e+06 3.141080e+06 1.683081e+07
min 4.150000e+02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 -2.239386e+07 -1.435085e+07 -1.746800e+07 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 -1.448299e+06 -1.556761e+06 -1.277128e+06 -2.626785e+07 -3.700000e+06 -7.439191e+06 -5.930205e+06 -2.872546e+06 -1.594206e+06 -3.335908e+07 -6.080911e+07 -3.185507e+07 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 5.184700e+05 4.002950e+05 6.975000e+02 7.300000e+02 3.775000e+02 6.432050e+04 5.949900e+04 3.985650e+04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.053550e+04 1.949950e+04 1.205200e+04 1.559340e+05 1.477175e+05 1.042130e+05 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.458000e+04 1.615450e+04 1.075200e+04 6.000000e+02 5.625000e+02 ... 9.125000e+04 8.197950e+04 5.863450e+04 1.444350e+04 1.248900e+04 9.404500e+03 -1.659450e+04 -2.148750e+04 -1.365000e+04 7.652500e+03 6.483000e+03 5.034000e+03 0.000000e+00 0.000000e+00 0.000000e+00 1.503500e+03 1.203000e+03 7.910000e+02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.754500e+03 1.750000e+03 8.245000e+02 1.013000e+03 1.045000e+03 0.000000e+00 4.000000 4.000000 3.000000 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 5.438740e+05
50% 1.695363e+06 1.533985e+06 1.381500e+04 1.238800e+04 9.955000e+03 3.347110e+05 3.155580e+05 2.630320e+05 0.000000e+00 0.000000e+00 0.000000e+00 6.410000e+02 6.030000e+02 3.650000e+02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.257350e+05 1.193750e+05 1.008450e+05 5.375490e+05 5.138090e+05 4.571890e+05 2.296820e+05 2.075290e+05 1.797610e+05 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 6.617500e+04 6.966300e+04 5.812100e+04 5.084000e+03 4.535000e+03 ... 2.699830e+05 2.525620e+05 2.204670e+05 4.847200e+04 4.608400e+04 4.158700e+04 0.000000e+00 -5.000000e+01 0.000000e+00 2.294600e+04 2.142400e+04 1.997200e+04 1.400000e+01 2.000000e+01 2.400000e+01 8.236000e+03 7.691000e+03 7.702000e+03 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1.053200e+04 1.050300e+04 1.029100e+04 1.601000e+04 1.603900e+04 1.249900e+04 10.000000 9.000000 9.000000 0.000000e+00 0.000000e+00 0.000000e+00 1.145440e+05 1.071490e+05 8.262500e+04 1.760361e+06
75% 6.050798e+06 5.594102e+06 8.305000e+04 7.726200e+04 6.841050e+04 1.514472e+06 1.470158e+06 1.357036e+06 1.525510e+05 1.505230e+05 8.037500e+03 1.238250e+04 1.148500e+04 9.743000e+03 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 7.469065e+05 7.242070e+05 6.505790e+05 1.876297e+06 1.835515e+06 1.688428e+06 1.212872e+06 1.228060e+06 1.120250e+06 8.460000e+02 1.243500e+03 1.800000e+03 0.000000e+00 0.000000e+00 0.000000e+00 3.167980e+05 3.178475e+05 2.838085e+05 2.642000e+04 2.368550e+04 ... 6.861245e+05 6.513525e+05 6.123230e+05 1.769925e+05 1.651410e+05 1.541010e+05 6.946000e+03 4.335500e+03 4.780500e+03 7.280900e+04 6.995300e+04 6.879700e+04 5.370000e+02 7.200000e+02 6.890000e+02 3.309400e+04 3.361550e+04 3.634900e+04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 4.810350e+04 4.970400e+04 5.255950e+04 9.653200e+04 9.412450e+04 8.194550e+04 20.000000 20.000000 19.000000 3.167000e+03 3.810500e+03 4.782000e+03 6.487320e+05 6.149110e+05 5.736580e+05 6.073618e+06
max 1.478231e+08 1.954054e+08 3.949990e+08 4.236760e+08 1.218370e+08 1.822085e+08 1.819357e+08 1.819572e+08 1.814344e+08 1.814863e+08 5.161911e+07 5.580400e+07 5.208281e+07 5.203800e+07 5.255939e+07 5.207929e+07 5.203700e+07 2.095430e+07 1.845768e+07 1.460000e+07 9.040429e+07 8.979462e+07 8.604582e+07 8.066182e+07 7.479026e+07 9.178638e+07 7.180935e+07 6.493519e+07 5.631069e+07 1.943005e+07 1.777804e+07 1.030000e+07 9.020468e+07 7.449480e+07 6.477029e+07 5.985522e+07 6.518876e+07 4.206783e+07 8.187088e+06 9.092467e+06 ... 6.004269e+07 5.354036e+07 3.470024e+07 5.065200e+07 4.429233e+07 2.746300e+07 1.973944e+07 8.680905e+06 8.962435e+06 1.006186e+07 1.105028e+07 1.301970e+07 1.048585e+07 2.550890e+07 6.901368e+06 1.134100e+07 6.313221e+06 6.851000e+06 6.784740e+05 3.876515e+06 1.553908e+06 9.993120e+05 1.840159e+06 6.716100e+05 9.545512e+06 7.919978e+06 1.002997e+07 1.838250e+07 1.384342e+07 2.333264e+07 1384.000000 1307.000000 1244.000000 1.943005e+07 1.777804e+07 1.662483e+07 2.025610e+08 1.872810e+08 1.169022e+08 1.754113e+08

8 rows × 135 columns

# Feature matrix: every column except the last one ('costi', the target).
X = data.iloc[:,0:-1]

# Target vector: total costs.
y = data['costi']
X.shape
(7479, 134)
y.shape
(7479,)
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split
# 70/30 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(5235, 134) (2244, 134) (5235,) (2244,)
from sklearn.linear_model import LinearRegression
# FIX: the 'normalize' argument was deprecated in scikit-learn 0.24 and
# removed in 1.2, where LinearRegression(normalize=True) raises TypeError.
# Plain OLS predictions are identical without it (sklearn rescaled the
# coefficients back to original feature units anyway); if scaling is wanted,
# use a Pipeline with StandardScaler in front of the estimator instead.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
# Fitted intercept term (display).
model.intercept_
-3200.777457194403
# One coefficient per input feature — 134 values (display).
model.coef_
array([-2.30996462e-03, -1.50058663e-02, -2.81813068e-02,  1.84983370e-01,
       -1.07535485e-01, -1.25291295e-02,  1.39249816e-01, -9.91288975e-02,
        4.14284936e-03, -4.74053837e-03, -4.09265041e-04, -1.07988920e-02,
        5.53436241e-02, -4.89814741e-02, -2.08352209e-02,  7.25301721e-02,
       -3.77319133e-02,  8.76194678e-02, -7.65657544e-02,  1.17948143e-01,
       -9.30513956e-01,  1.08357037e+00, -1.26653220e-01,  4.79202414e-03,
        1.43007817e-01, -1.15888295e-01,  9.94973398e-03,  2.70744978e-04,
       -7.92966224e-03,  2.69702385e-02, -4.17703102e-02,  1.29981403e-02,
       -2.03380493e-02,  1.65893118e-01, -1.27604963e-01, -2.66074722e-02,
        1.79488728e-01, -1.18126746e-01, -4.34200642e-02,  1.21637354e-02,
       -1.04109145e-02,  5.60581055e-03, -6.01850048e-02,  4.47773476e-02,
        2.20944502e-02, -1.13635756e-01,  7.64066351e-02,  1.19314776e-04,
        9.93569282e-03, -1.59561082e-02, -7.39703203e-02, -1.36109362e-01,
        1.99741967e-01,  1.20559367e-01, -6.27703586e-02, -1.04838456e-01,
        2.16557279e-03, -9.92887862e-02,  7.54038176e-02,  2.07529998e-03,
        4.41791060e-03,  9.55652299e-04, -5.91073869e-03, -4.61698720e-02,
       -3.67701369e-03,  4.96002655e-02,  3.53159925e-03, -1.41144644e-02,
        1.63627563e-02,  2.72654249e-02,  1.04871684e-02, -3.80283405e-02,
       -1.08564554e-02,  1.39354551e-02,  1.32101117e-03, -4.13592564e-03,
       -5.18201426e-03, -2.56661936e-02, -1.70815551e-01,  1.63681451e-01,
        1.24688231e+00, -4.47986551e-02, -9.86953813e-02, -9.54673606e-01,
        5.20125142e-02, -1.03908251e-01, -2.52766571e-01,  4.74436896e-02,
        1.18334001e-01, -2.33639979e-01,  7.00241620e-02,  9.27074982e-02,
       -3.03265154e-01,  8.86666687e-02,  8.77259902e-02, -2.27952770e-01,
       -1.43269119e-02,  1.39954894e-01, -3.04086786e-01,  1.11272269e-01,
        3.04783758e-02, -1.19847308e+00,  1.00332024e-02,  1.20315710e-01,
       -1.96644059e-01, -9.27282850e-02,  1.83966606e-01,  1.92388403e-01,
        1.04369772e-01, -1.97001878e-01, -8.36998547e-02, -1.21567478e-01,
        8.16877455e-02, -5.26305994e-01, -3.52562902e-02, -1.94390763e-01,
        2.07845429e-01, -8.20499899e-01, -1.30818522e-03, -3.24226028e-01,
        1.05224108e-01,  5.12933641e-02, -2.29442019e-01, -2.68507534e-03,
        1.74531598e-01,  6.62690395e+02, -2.30495434e+02,  2.22724695e+02,
       -2.02134192e-02,  1.36489920e-02,  9.06889487e-03,  2.07529998e-03,
        4.41791060e-03,  9.55652299e-04])
# Predict costs for the held-out test set.
y_predict = model.predict(X_test)
import numpy as np
length = y_predict.shape[0] # number of test samples
# Evenly spaced x-axis positions, one per test sample.
x = np.linspace(0,length,length)


plt.figure(figsize=(30,15))
# Overlay actual vs. predicted costs to eyeball fit quality.
plt.plot(x, y_test, label='costi_reali')
plt.plot(x, y_predict, label='costi_predetti')
plt.legend(loc=2);
../../_images/regressore_custom_data_17_0.png

Export coeff of regressor

# Grab the fitted coefficients and display them.
coeff = model.coef_
coeff

### EXPORT THE COEFF OF REGRESSOR
# One-column table of coefficients, written out as an Excel workbook.
coef_table = pd.DataFrame({'coeff': coeff})
coef_table.to_excel('formula.xlsx', index=False)

Export Prediction in a final column

# Append in-sample predictions over the full dataset as a final column.
data['costi_PRED']=model.predict(data.iloc[:,0:-1])
# Side-by-side preview of actual vs. predicted costs.
data[['costi', 'costi_PRED']].head()
costi costi_PRED
0 144205134 1.443731e+08
1 140165127 1.379588e+08
2 145190000 1.444782e+08
3 112760368 1.130755e+08
4 171980718 1.734026e+08
# Save the augmented dataset (features + actual + predicted costs) to Excel.
data.to_excel('result1.xlsx', index=False)

Export model

import pickle
# Persist the fitted estimator to disk for later reuse.
pickle.dump( model, open( 'model.p', 'wb' ) )
# Linear regression model with all 134 Xi features

Import the model (pickle) with new data

# Re-open the Colab upload widget to bring in a new test dataset.
from google.colab import files
uploaded = files.upload()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving test_cost.xlsx to test_cost.xlsx
# Load a new dataset with the same 134-feature layout plus 'costi'.
import pandas as pd 
import numpy as np

df = pd.read_excel('test_cost.xlsx')

# Features: all columns except the last; target: 'costi'.
X = df.iloc[:,0:-1]
y = df['costi']
X.shape
(100, 134)
# Load the pickled model back from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file —
# only unpickle model files that come from a trusted source.
model = pickle.load( open( "model.p", "rb" ) )
y_predict = model.predict(X)
# Store predictions alongside the actuals and preview them.
df['costi_PRED'] = model.predict(X)
df[['costi', 'costi_PRED']].head()
costi costi_PRED
0 2508929000 2.522365e+09
1 2173873000 2.171454e+09
2 1526367000 1.572747e+09
3 1174142000 1.166190e+09
4 0 -4.325780e+07
length = y.shape[0] # number of rows in the new dataset
x = np.linspace(0,length,length)


plt.figure(figsize=(20,15))
# Compare actual vs. predicted costs on the new dataset.
plt.plot(x, y, label='costi_reali')
plt.plot(x, y_predict, label='costi_predetti')
plt.legend(loc=2);
../../_images/regressore_custom_data_32_0.png
# Save the new dataset together with its predictions.
df.to_excel('result_new_dataset.xlsx', index=False)

Reduce the number of regressors to 8

# Correlation analysis on a copy of the data without the prediction column
# that was appended earlier (drop returns a new frame, original untouched).
data2 = data.drop(columns=['costi_PRED'])
corrmat = data2.corr()
corrmat
var1 var2 var6 var7 var8 var9 var10 var11 var12 var13 var14 var15 var16 var17 var18 var19 var20 var21 var22 var23 var27 var28 var29 var30 var31 var32 var33 var34 var35 var39 var40 var41 var42 var43 var44 var45 var46 var47 var48 var49 ... var108 var109 var110 var111 var112 var113 var114 var115 var116 var123 var124 var125 var126 var127 var128 var129 var130 var131 var132 var133 var134 var135 var136 var137 var141 var142 var143 var147 var148 var149 var150 var151 var152 var153 var154 var155 var156 var157 var158 costi
var1 1.000000 0.986345 0.135692 0.130403 0.235256 0.652578 0.639252 0.625818 0.522128 0.517458 0.624950 0.415115 0.386054 0.369241 0.381076 0.351668 0.337167 0.187145 0.182562 0.213661 0.643967 0.652575 0.653724 0.885616 0.901770 0.889494 0.849578 0.871460 0.873803 0.206912 0.217005 0.260878 0.152898 0.170960 0.167835 0.561514 0.529838 0.551337 0.342868 0.313105 ... 0.738153 0.747532 0.763650 0.557382 0.591085 0.674079 -0.081304 -0.108396 -0.161343 0.458148 0.441197 0.424094 0.188005 0.141545 0.265174 0.387952 0.532638 0.535435 -0.033266 -0.075452 0.034593 -0.030284 -0.157753 -0.136756 0.503319 0.566754 0.620623 0.374695 0.314028 0.392427 0.639018 0.626608 0.642236 0.188735 0.188945 0.223395 0.456726 0.488830 0.537188 0.986153
var2 0.986345 1.000000 0.183263 0.193421 0.267485 0.656802 0.644170 0.635073 0.529413 0.525425 0.638663 0.416329 0.389934 0.370164 0.378868 0.351583 0.336201 0.188228 0.182147 0.214706 0.637955 0.648195 0.653929 0.883733 0.895384 0.894533 0.852143 0.868651 0.883315 0.217426 0.220016 0.264968 0.152903 0.168143 0.169033 0.553340 0.528199 0.553966 0.330029 0.305480 ... 0.726053 0.733222 0.767513 0.594631 0.603129 0.712480 -0.078030 -0.105031 -0.157499 0.464254 0.447246 0.434021 0.186468 0.152456 0.271204 0.413598 0.548888 0.565061 -0.065981 -0.084900 0.030522 -0.042834 -0.174988 -0.136803 0.465343 0.548087 0.623318 0.341054 0.290481 0.381861 0.630293 0.620489 0.642667 0.198160 0.191570 0.226940 0.476820 0.518401 0.542348 0.976198
var6 0.135692 0.183263 1.000000 0.873993 0.813696 0.189718 0.182933 0.165854 0.131329 0.127274 0.139800 0.335777 0.130861 0.101365 0.259090 0.096747 0.087498 0.003349 0.003959 0.006180 0.113859 0.109382 0.099655 0.148542 0.155342 0.128628 0.128418 0.143798 0.123677 0.097007 0.055760 0.051068 0.005120 0.005173 0.008444 0.089825 0.176643 0.165656 0.055283 0.023465 ... 0.184832 0.138871 0.182232 0.674075 0.334837 0.517908 0.001620 -0.048689 -0.013749 0.107458 0.085651 0.087308 0.059003 0.078457 0.122742 0.622851 0.224365 0.521631 -0.216426 0.005878 -0.069827 -0.004018 0.025614 -0.184567 -0.090820 0.202786 0.202446 -0.125143 0.031223 0.106843 0.145932 0.139871 0.135043 0.087843 0.048679 0.043582 0.752713 0.479469 0.243378 0.176026
var7 0.130403 0.193421 0.873993 1.000000 0.725537 0.226065 0.216583 0.194307 0.151139 0.145181 0.156576 0.219797 0.143006 0.104691 0.167873 0.099393 0.089165 0.002658 0.002687 0.006144 0.125486 0.113749 0.104222 0.166999 0.175232 0.145167 0.156505 0.176031 0.147110 0.129380 0.052250 0.051619 0.007890 0.007631 0.012758 0.103887 0.144036 0.178280 0.040511 0.028061 ... 0.189389 0.127823 0.182980 0.748622 0.354451 0.591595 -0.006842 -0.050565 -0.008803 0.114722 0.079594 0.102993 0.053131 0.101960 0.150623 0.498843 0.280969 0.427650 -0.335108 0.048683 -0.111728 -0.003179 0.028650 -0.200492 -0.115681 0.134505 0.202609 -0.158951 -0.029327 0.084978 0.151873 0.147328 0.136093 0.117306 0.045500 0.044010 0.652591 0.635520 0.317137 0.184230
var8 0.235256 0.267485 0.813696 0.725537 1.000000 0.265146 0.266050 0.246782 0.206811 0.205262 0.207759 0.363357 0.211711 0.196459 0.298309 0.178568 0.184040 0.006438 0.006876 0.014061 0.157938 0.157350 0.144310 0.222544 0.228397 0.213953 0.195191 0.206778 0.197445 0.078609 0.065872 0.071788 0.020384 0.019720 0.025038 0.147141 0.210100 0.266617 0.084317 0.061866 ... 0.247220 0.235928 0.261570 0.617437 0.510830 0.597908 -0.008968 -0.097494 -0.007816 0.167191 0.169458 0.136359 0.073326 0.214011 0.352343 0.600951 0.449527 0.642732 -0.081671 0.264306 -0.198528 -0.002500 0.070876 -0.465443 0.017319 0.243098 0.260698 -0.111626 0.026316 0.082273 0.197056 0.190604 0.191515 0.070684 0.057094 0.061017 0.708195 0.452155 0.439678 0.243066
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
var155 0.223395 0.226940 0.043582 0.044010 0.061017 0.202353 0.202773 0.201791 0.166981 0.163899 0.199016 0.128268 0.126408 0.120588 0.118179 0.114071 0.110688 0.015444 0.016602 0.023159 0.138535 0.145335 0.156384 0.305291 0.327402 0.325720 0.251763 0.262242 0.250124 0.576255 0.675614 0.862762 0.004004 0.003298 0.012819 0.125483 0.102116 0.109212 0.134293 0.113861 ... 0.171965 0.176136 0.183390 0.177772 0.171289 0.206994 -0.010671 0.017562 -0.024172 0.152030 0.140095 0.179137 0.051656 0.034954 0.154785 0.188886 0.220122 0.223574 -0.057395 -0.054727 0.035122 -0.017513 -0.055084 -0.011086 0.088607 0.100526 0.105844 0.019526 0.030046 0.055176 0.137069 0.136641 0.141692 0.722125 0.827372 1.000000 0.192632 0.232049 0.277538 0.219139
var156 0.456726 0.476820 0.752713 0.652591 0.708195 0.578031 0.564299 0.533198 0.458980 0.444540 0.504364 0.452596 0.297705 0.269472 0.397359 0.274203 0.262533 0.086206 0.092490 0.101815 0.435457 0.433399 0.409762 0.469106 0.483043 0.454719 0.410637 0.430182 0.412124 0.210692 0.154981 0.199667 0.042844 0.038677 0.016677 0.260620 0.313495 0.305936 0.227179 0.178598 ... 0.415474 0.386672 0.415266 0.730107 0.523319 0.671638 -0.040482 -0.092845 -0.093796 0.289609 0.283453 0.313727 0.136388 0.151322 0.281111 0.693159 0.540685 0.713428 -0.154815 0.045929 -0.046221 -0.024945 -0.017496 -0.296575 0.069369 0.319128 0.329185 -0.012658 0.103984 0.169218 0.349209 0.335781 0.335507 0.207735 0.154641 0.192632 1.000000 0.800378 0.698905 0.482840
var157 0.488830 0.518401 0.479469 0.635520 0.452155 0.656210 0.645442 0.611144 0.515172 0.502036 0.573969 0.323258 0.336961 0.292983 0.285183 0.290790 0.266979 0.104310 0.115773 0.123223 0.454784 0.451243 0.440515 0.516594 0.539544 0.516462 0.474865 0.500620 0.481140 0.237844 0.181698 0.239888 0.041072 0.037496 0.023761 0.279626 0.299713 0.313378 0.232222 0.210832 ... 0.438662 0.400436 0.440466 0.748071 0.511556 0.704817 -0.049675 -0.079458 -0.124970 0.330287 0.311307 0.359791 0.153271 0.247595 0.213060 0.546249 0.626313 0.583229 -0.258613 -0.054892 -0.005612 -0.066081 -0.086703 -0.165852 0.074250 0.241839 0.319580 -0.000928 0.072383 0.144425 0.378126 0.368310 0.368484 0.236244 0.182645 0.232049 0.800378 1.000000 0.777121 0.524987
var158 0.537188 0.542348 0.243378 0.317137 0.439678 0.682216 0.684901 0.684083 0.570262 0.562681 0.652159 0.353212 0.377000 0.368508 0.333145 0.350430 0.352395 0.107082 0.116334 0.123803 0.483165 0.489269 0.490547 0.548446 0.567178 0.554446 0.483307 0.495304 0.490955 0.214167 0.222927 0.286564 0.045740 0.041128 0.024487 0.277906 0.272270 0.341752 0.245506 0.231290 ... 0.446247 0.449236 0.471581 0.555260 0.553331 0.643753 -0.069377 -0.074575 -0.131576 0.353918 0.354668 0.399339 0.152856 0.081858 0.402450 0.427419 0.662039 0.669947 -0.068658 0.173842 -0.114673 -0.078316 0.013429 -0.420892 0.207181 0.285600 0.296679 0.049811 0.051782 0.095128 0.387896 0.380843 0.393833 0.217472 0.223513 0.277538 0.698905 0.777121 1.000000 0.535850
costi 0.986153 0.976198 0.176026 0.184230 0.243066 0.663195 0.645689 0.623935 0.526338 0.519144 0.620732 0.412508 0.386070 0.362460 0.375275 0.349002 0.328834 0.185970 0.182333 0.213124 0.651736 0.654572 0.638381 0.892665 0.896231 0.874316 0.865041 0.873108 0.864293 0.205703 0.211874 0.256292 0.151247 0.170439 0.164481 0.562900 0.519217 0.545312 0.340801 0.312354 ... 0.758261 0.753295 0.765291 0.595142 0.576023 0.690973 -0.104126 -0.131195 -0.162064 0.458831 0.423249 0.409725 0.188688 0.140951 0.248732 0.400310 0.515660 0.524363 -0.062510 -0.096260 0.036555 -0.032482 -0.157398 -0.124410 0.507332 0.565530 0.625243 0.387224 0.337390 0.416265 0.654021 0.631454 0.640518 0.187614 0.184095 0.219139 0.482840 0.524987 0.535850 1.000000

135 rows × 135 columns

# Full 135x135 correlation heatmap — dense, used to spot strong predictors.
fig, ax = plt.subplots(figsize = (18, 10))
sns.heatmap(corrmat, annot = True, annot_kws={'size': 12});
../../_images/regressore_custom_data_37_0.png
# Row/column labels of the correlation matrix (display).
corrmat.index.values
array(['var1', 'var2', 'var6', 'var7', 'var8', 'var9', 'var10', 'var11',
       'var12', 'var13', 'var14', 'var15', 'var16', 'var17', 'var18',
       'var19', 'var20', 'var21', 'var22', 'var23', 'var27', 'var28',
       'var29', 'var30', 'var31', 'var32', 'var33', 'var34', 'var35',
       'var39', 'var40', 'var41', 'var42', 'var43', 'var44', 'var45',
       'var46', 'var47', 'var48', 'var49', 'var50', 'var51', 'var52',
       'var53', 'var54', 'var55', 'var56', 'var57', 'var58', 'var59',
       'var60', 'var61', 'var62', 'var63', 'var64', 'var65', 'var66',
       'var67', 'var68', 'var69', 'var70', 'var71', 'var72', 'var73',
       'var74', 'var75', 'var76', 'var77', 'var78', 'var79', 'var80',
       'var81', 'var82', 'var83', 'var84', 'var85', 'var86', 'var90',
       'var91', 'var92', 'var93', 'var94', 'var95', 'var96', 'var97',
       'var98', 'var99', 'var100', 'var101', 'var102', 'var103', 'var104',
       'var105', 'var106', 'var107', 'var108', 'var109', 'var110',
       'var111', 'var112', 'var113', 'var114', 'var115', 'var116',
       'var123', 'var124', 'var125', 'var126', 'var127', 'var128',
       'var129', 'var130', 'var131', 'var132', 'var133', 'var134',
       'var135', 'var136', 'var137', 'var141', 'var142', 'var143',
       'var147', 'var148', 'var149', 'var150', 'var151', 'var152',
       'var153', 'var154', 'var155', 'var156', 'var157', 'var158',
       'costi'], dtype=object)
def getCorrelatedFeature(corrdata, threshold):
    """Return the features whose absolute correlation exceeds *threshold*.

    Parameters
    ----------
    corrdata : pd.Series
        Correlation values indexed by feature name (e.g. one column of a
        correlation matrix).
    threshold : float
        Strict lower bound on ``abs(correlation)`` for a feature to be kept.

    Returns
    -------
    pd.DataFrame
        Single column 'Corr Value', indexed by the selected feature names,
        in the original order of *corrdata*.
    """
    feature = []
    value = []
    # Iterate label/value pairs directly: avoids the per-label Series
    # lookup and the unused enumerate() counter of the original version.
    for name, corr in corrdata.items():
        if abs(corr) > threshold:
            feature.append(name)
            value.append(corr)
    # Local name chosen not to shadow the notebook-level 'df' DataFrame.
    result = pd.DataFrame(data=value, index=feature, columns=['Corr Value'])
    return result
# Keep only features with |corr(costi)| > 0.90 (selects 8 vars + the target).
threshold = 0.90
corr_value = getCorrelatedFeature(corrmat['costi'], threshold)
corr_value
Corr Value
var1 0.986153
var2 0.976198
var93 0.998334
var94 0.985817
var95 0.975280
var99 0.959306
var100 0.946764
var101 0.936186
costi 1.000000
# Names of the selected features (display).
corr_value.index.values
array(['var1', 'var2', 'var93', 'var94', 'var95', 'var99', 'var100',
       'var101', 'costi'], dtype=object)
# Reduced dataset: only the highly correlated columns (plus the target).
correlated_data = data2[corr_value.index]
correlated_data.head()
var1 var2 var93 var94 var95 var99 var100 var101 costi
0 147823139 148193655 145914503 148992105 149021406 85824629 86758885 83651116 144205134
1 147562322 163251010 142727898 150222876 166417107 102237238 113037130 131392006 140165127
2 147562000 142652000 148757000 153783000 148906000 117422000 111227000 108035000 145190000
3 147067625 140160124 119349203 145528364 143260149 105779603 129982682 121831610 112760368
4 145273739 127413305 174796438 148011089 130682792 149110551 119058121 106955719 171980718
correlated_data.shape
(7479, 9)
# Pairwise scatter plots of the reduced feature set.
sns.pairplot(correlated_data)
plt.tight_layout()
../../_images/regressore_custom_data_44_0.png
# Correlation heatmap restricted to the 8 selected features + target.
sns.heatmap(correlated_data.corr(), annot=True, annot_kws={'size': 12});
../../_images/regressore_custom_data_45_0.png
# Rebuild X/y from the reduced 8-feature dataset.
X = correlated_data.drop(labels=['costi'], axis = 1)
y = correlated_data['costi']
X.head()
var1 var2 var93 var94 var95 var99 var100 var101
0 147823139 148193655 145914503 148992105 149021406 85824629 86758885 83651116
1 147562322 163251010 142727898 150222876 166417107 102237238 113037130 131392006
2 147562000 142652000 148757000 153783000 148906000 117422000 111227000 108035000
3 147067625 140160124 119349203 145528364 143260149 105779603 129982682 121831610
4 145273739 127413305 174796438 148011089 130682792 149110551 119058121 106955719
from sklearn.model_selection import train_test_split
# 80/20 split this time, same fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape
((5983, 8), (1496, 8))
# FIX: 'normalize' was deprecated in scikit-learn 0.24 and removed in 1.2,
# where it raises TypeError; OLS predictions are identical without it.
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
y_predict = model.predict(X_test)
# Put predictions and actuals side by side: row 0 = predictions,
# row 1 = actuals, so after .T column 0 holds the predictions.
df = pd.DataFrame(data = [y_predict, y_test])
df = df.T
# BUG FIX: the labels were swapped — column 0 is the prediction, column 1
# the real cost (matches the 'costi'/'costi_PRED' naming used earlier; the
# old code labelled the float predictions as 'costi').
df.columns = ['costi_PRED', 'costi']
df
costi costi_PRED
0 3.664867e+06 3534300.0
1 7.809564e+06 7874798.0
2 1.152198e+08 115850068.0
3 1.010573e+06 1048631.0
4 3.258395e+06 3348815.0
... ... ...
1491 9.072874e+06 9303572.0
1492 2.374781e+06 2310607.0
1493 5.624134e+06 5696533.0
1494 5.119308e+06 5233938.0
1495 2.547442e+05 295468.0

1496 rows × 2 columns

length = y_predict.shape[0] # number of test samples
x = np.linspace(0,length,length)

plt.figure(figsize=(20,15))
# Actual vs. predicted costs for the reduced (8-feature) model.
plt.plot(x, y_test, label='test')
plt.plot(x, y_predict, label='predetto')
plt.legend(loc=2);
../../_images/regressore_custom_data_52_0.png
# Intercept of the reduced model (display).
model.intercept_
-27689.42884542793
# Eight coefficients, one per selected feature.
coeff = model.coef_
coeff
array([ 0.10767113,  0.3805769 ,  0.93862885, -0.09831021, -0.34587131,
        0.00826232,  0.04751049, -0.04413217])
# Column labels of the reduced dataset (8 feature names + 'costi' last).
label = np.asarray(correlated_data.columns)
label
array(['var1', 'var2', 'var93', 'var94', 'var95', 'var99', 'var100',
       'var101', 'costi'], dtype=object)
# Build the export table directly from the fitted model instead of
# hard-coding pasted coefficient values — the old literal lists would drift
# silently out of sync if the model were refit.  'label' still ends with the
# target name 'costi', so drop its last entry to align with the coefficients.
export = pd.DataFrame({
    'coeff': coeff,
    'label': label[:-1] })
export[['coeff','label']]
coeff label
0 0.107671 var1
1 0.380577 var2
2 0.938629 var93
3 -0.098310 var94
4 -0.345871 var95
5 0.008262 var99
6 0.047510 var100
7 -0.044132 var101
# Write the reduced-model coefficient table to Excel.
export.to_excel('ridotta.xlsx', index=False)
## test the formula against the test Excel file