Linear Regression Evaluation Metrics
In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.datasets as datasets
In [2]:
diabetes = datasets.load_diabetes()
Out[2]: {'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
'target': array([151., 75.,141.,206.,135., 97.,138.,63.,110.,310.,101.,
69.,179.,185.,118.,171.,166.,144.,97.,168.,68.,49.,
68.,245.,184.,202.,137.,85.,131.,283.,129.,59.,341.,
87.,65.,102.,265.,276.,252.,90.,100.,55.,61.,92.,
259.,53.,190.,142.,75.,142.,155.,225.,59.,104.,182.,
128.,52.,37.,170.,170.,61.,144.,52.,128.,71.,163.,
150.,97.,160.,178.,48.,270.,202.,111.,85.,42.,170.,
200.,252.,113.,143.,51.,52.,210.,65.,141.,55.,134.,
42.,111.,98.,164.,48.,96.,90.,162.,150.,279.,92.,
83.,128.,102.,302.,198.,95.,53.,134.,144.,232.,81.,
104.,59.,246.,297.,258.,229.,275.,281.,179.,200.,200.,
173.,180.,84.,121.,161.,99.,109.,115.,268.,274.,158.,
107.,83.,103.,272.,85.,280.,336.,281.,118.,317.,235.,
60.,174.,259.,178.,128.,96.,126.,288.,88.,292.,71.,
197.,186.,25.,84.,96.,195.,53.,217.,172.,131.,214.,
59.,70.,220.,268.,152.,47.,74.,295.,101.,151.,127.,
237.,225.,81.,151.,107.,64.,138.,185.,265.,101.,137.,
143.,141.,79.,292.,178.,91.,116.,86.,122.,72.,129.,
142.,90.,158.,39.,196.,222.,277.,99.,196.,202.,155.,
77.,191.,70.,73.,49.,65.,263.,248.,296.,214.,185.,
78.,93.,252.,150.,77.,208.,77.,108.,160.,53.,220.,
154.,259.,90.,246.,124.,67.,72.,257.,262.,275.,177.,
71.,47.,187.,125.,78.,51.,258.,215.,303.,243.,91.,
150.,310.,153.,346.,63.,89.,50.,39.,103.,308.,116.,
145.,74.,45.,115.,264.,87.,202.,127.,182.,241.,66.,
94.,283.,64.,102.,200.,265.,94.,230.,181.,156.,233.,
60.,219.,80.,68.,332.,248.,84.,200.,55.,85.,89.,
31.,129.,83.,275.,65.,198.,236.,253.,124.,44.,172.,
114.,142.,109.,180.,144.,163.,147.,97.,220.,190.,109.,
191.,122.,230.,242.,248.,249.,192.,131.,237.,78.,135.,
244.,199.,270.,164.,72.,96.,306.,91.,214.,95.,216.,
263.,178.,113.,200.,139.,139.,88.,148.,88.,243.,71.,
77.,109.,272.,60.,54.,221.,90.,311.,281.,182.,321.,
58.,262.,206.,233.,242.,123.,167.,63.,197.,71.,168.,
140.,217.,121.,235.,245.,40.,52.,104.,132.,88.,69.,
219.,72.,201.,110.,51.,277.,63.,118.,69.,273.,258.,
43.,198.,242.,232.,175.,93.,168.,275.,293.,281.,72.,
140.,189.,181.,209.,136.,261.,113.,131.,174.,257.,55.,
84.,42.,146.,212.,233.,91.,111.,152.,120.,67.,310.,
94.,183.,66.,173.,72.,49.,64.,48.,178.,104.,132.,
220.,57.]),
'DESCR': 'Diabetes dataset\n================\n\nNotes\n-----\n\nTen baseline variables, age, sex,
 body mass index, average blood\npressure, and six blood serum measurements were obtained for each
 of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of
 disease progression one year after baseline.\n\nData Set Characteristics:\n\n  :Number of
 Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n
 :Target: Column 11 is a quantitative measure of disease progression one year after
 baseline\n\n  :Attributes:\n    :Age:\n    :Sex:\n    :Body mass index:\n    :Average blood
 pressure:\n    :S1:\n    :S2:\n    :S3:\n    :S4:\n    :S5:\n    :S6:\n\nNote: Each of these 10
 feature variables have been mean centered and scaled by the standard deviation times n_samples
 (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttp://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor
 more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004)
 "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(http://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n',
 'feature_names': ['age',
  'sex',
  'bmi',
  'bp',
  's1',
  's2',
  's3',
  's4',
  's5',
  's6']}
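load_diabetes() returns a scikit-learn Bunch, so the entries printed above can also be read as attributes; a quick sketch (not executed in the original session):

In [ ]:
diabetes.data.shape     # (442, 10) -- same array as diabetes['data']
diabetes.target.shape   # (442,)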
In [3]:
X = diabetes['data']
y = diabetes['target']
In [4]:
from sklearn.model_selection import train_test_split
In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [6]:
X_train.shape
Out[6]: (353, 10)
In [7]:
X_train[:5]
Out[7]: array([[-0.08906294,-0.04464164,-0.01159501,-0.03665645,0.01219057,
0.02499059,-0.03603757, 0.03430886, 0.02269202,-0.00936191],
[0.02717829,0.05068012,-0.00620595,0.0287581,-0.01670444,
-0.00162703,-0.0581274,0.03430886,0.02930041,0.03205916],
[0.01628068, 0.05068012,-0.046085,0.01154374,-0.03321588,
-0.01603186,-0.01026611,-0.00259226,-0.0439854,-0.04249877],
[0.04170844,0.05068012,-0.01590626,0.01728186,-0.03734373,
-0.01383982,-0.02499266,-0.01107952,-0.04687948,0.01549073],
[-0.02367725,-0.04464164,-0.01590626,-0.01255635,0.02044629,
0.04127431,-0.04340085, 0.03430886,0.01407245,-0.00936191]])
In [10]:
# The data has both positive and negative values, which shows it has been
# preprocessed (mean-centered and normalized)
X_train.std(axis=0)
Out[10]: array([0.04734822,0.0475412,0.04825435,0.04607366,0.04724651,
0.04758674,0.04927031,0.04779197,0.04657944,0.04707625])
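The DESCR above states that each feature column was mean-centered and scaled so that the sum of squares of each column totals 1; a quick check of that claim (a sketch, run on the full data rather than the 80% split):

In [ ]:
# per DESCR: each column's sum of squares should total 1
(diabetes['data'] ** 2).sum(axis=0)   # expect ten values of 1.0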
In [11]:
lr = LinearRegression()
In [12]:
lr.fit(X_train, y_train)
Out[12]: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
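Once fitted, the learned parameters are available through scikit-learn's standard estimator attributes; a minimal sketch:

In [ ]:
lr.coef_        # one weight per feature, shape (10,)
lr.intercept_   # the fitted bias term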
In [16]:
y_ = lr.predict(X_test)
y_.round(2)
Out[16]: array([85.12,163.71,63.55,218.3,163.06,150.55,82.88,193.32,
208.76,125.45,107.27,238.29,59.74,100.33,112.66,241.96,
187.74,188.74,148.74,199.43,209.59,229.64,160.79,149.22,
135.61,211.9,65.1,130.51,128.21,127.21,176.68,198.78,
217.68,74.84,122.66,130.45,99.47,155.85,71.78,82.48,
174.39,90.78,76.32,159.8,230.54,108.38,209.47,215.79,
198.95, 71.31,79.04,90.62,194.97,191.12,86.71,238.62,
162.68,196.31,210.25,78.46,147.33,201.89,291.03,215.39,
176.53, 58.49,122.49,192.83,175.76,130.75,249.97,152.78,
79.42,194.09,224.39,173.03,213.03,95.59,216.38,166.36,
81.86,93.82,209.76,116.94,115.04,144.28,256.24,179.02,
169.27])
In [14]:
y_test
Out[14]: array([181.,179., 77.,295.,131.,202.,37.,257.,52.,139.,102.,
195.,65., 53.,97.,306.,78.,241.,25.,186.,221.,261.,
209.,100.,40.,288.,52.,92.,145.,150.,91.,265.,225.,
77.,84.,53.,54.,85.,55.,80.,262.,64.,200.,113.,
232.,97.,192.,275.,131.,92.,138.,49.,292.,232.,55.,
280.,220.,220.,265.,89.,150.,268.,270.,152.,77.,85.,
162.,123.,216.,131.,310.,246.,42.,68.,281.,141.,220.,
72.,163.,178.,51.,71.,121.,61.,160.,200.,336.,52.,
184.])
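Before turning to numeric scores, it can help to eyeball predictions against the ground truth; a sketch comparing the first few pairs:

In [ ]:
# truth in the first column, rounded prediction in the second
np.column_stack([y_test[:10], y_[:10].round(2)])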
In [ ]:
'''The coefficient R^2 is defined as (1 - u/v), where u is the residual
sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
sum of squares ((y_true - y_true.mean()) ** 2).sum().
The best possible score is 1.0 and it can be negative (because the
model can be arbitrarily worse). A constant model that always
predicts the expected value of y, disregarding the input features,
would get a R^2 score of 0.0.'''
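In formula form, the definition quoted above is

$$R^2 = 1 - \frac{u}{v} = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2},$$

where $\hat{y}_i$ are the predictions and $\bar{y}$ is the mean of the true targets.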
In [15]:
# R^2, the coefficient of determination
lr.score(X_test, y_test)
Out[15]: 0.5100017419052714
In [17]:
# u = ((y_true - y_pred) ** 2).sum()
u = ((y_test - y_) ** 2).sum()
u
Out[17]: 304838.15677376505
In [19]:
# v = ((y_true - y_true.mean()) ** 2).sum()
v = ((y_test - y_test.mean()) ** 2).sum()
v
Out[19]: 622120.8988764045
In [20]:
1 - u/v
Out[20]: 0.5100017419052714
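The same quantity is exposed directly as sklearn.metrics.r2_score; assuming the split above, this should reproduce lr.score(X_test, y_test):

In [ ]:
from sklearn.metrics import r2_score
r2_score(y_test, y_)   # same 1 - u/v computation as above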
In [21]:
1 - np.var(y_test - y_) / np.var(y_test)
Out[21]: 0.5108163438179167
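Note that this variance-ratio form is not exactly R^2: np.var subtracts the residuals' own mean, so any constant bias in the predictions is ignored, which is why it comes out slightly higher here (0.5108 vs. 0.5100). It matches scikit-learn's explained_variance_score; a sketch:

In [ ]:
from sklearn.metrics import explained_variance_score
explained_variance_score(y_test, y_)   # 1 - var(y_test - y_) / var(y_test)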
In [22]:
np.abs(y_test - y_).mean()
Out[22]: 47.55520213877001
In [25]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error
In [24]:
mean_absolute_error(y_test, y_)
Out[24]: 47.55520213877001
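The other metrics imported above apply the same way; a sketch (mean_squared_log_error requires non-negative values, which both y_test and the predictions satisfy here):

In [ ]:
mean_squared_error(y_test, y_)             # MSE
np.sqrt(mean_squared_error(y_test, y_))    # RMSE, back in the target's units
mean_squared_log_error(y_test, y_)         # MSLE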