# 17. 编码

``````
>>> from sklearn import linear_model
>>> reg = linear_model.LinearRegression()
>>> reg.fit([[0,0],[1,1],[2,2]],[0,1,2])
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
>>> reg.coef_
array([ 0.5,  0.5])
``````

# 18. sklearn 中的年龄/净值回归

• `studentMain.py`
``````
#!/usr/bin/python

import numpy
import matplotlib
matplotlib.use('agg')

import matplotlib.pyplot as plt
from studentRegression import studentReg
from class_vis import prettyPicture, output_image

from ages_net_worths import ageNetWorthData

# Load the train/test split of ages and net worths from the course helper.
ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()

# Fit the student's regression on the training portion only.
reg = studentReg(ages_train, net_worths_train)

# Scatter both splits and overlay the fitted line evaluated on the test ages.
plt.clf()
plt.scatter(ages_train, net_worths_train, color="b", label="train data")
plt.scatter(ages_test, net_worths_test, color="r", label="test data")
plt.plot(ages_test, reg.predict(ages_test), color="black")
plt.legend(loc=2)
plt.xlabel("ages")
plt.ylabel("net worths")

plt.savefig("test.png")
# Read the saved image back through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("test.png", "rb") as image_file:
    image_bytes = image_file.read()
# NOTE(review): output_image comes from class_vis (not shown here); it is
# handed the file name, the format and the raw PNG bytes, as before.
output_image("test.png", "png", image_bytes)
``````
• `studentRegression.py`
``````
def studentReg(ages_train, net_worths_train):
    ### import the sklearn regression module, create, and train your regression
    ### name your regression reg

    ### your code goes here!
    from sklearn import linear_model
    reg = linear_model.LinearRegression()

    reg.fit(ages_train,net_worths_train)

    return reg
``````
• 输出结果：

# 19. 通过sklearn提取信息

``````
...
reg.fit(ages_train, net_worths_train)

# Predict the net worth of a 27-year-old with the trained regression.
# predict() expects a 2-D array of shape (n_samples, n_features), so the
# single age is wrapped as [[27]] rather than passed as a flat [27].
print(reg.predict([[27]]))

# Slope of the fitted regression line.
print(reg.coef_)

# Intercept of the fitted regression line.
print(reg.intercept_)

# R-squared score on the test data; the maximum is 1 and a higher value
# means a better fit.
print(reg.score(ages_test, net_worths_test))
``````

# 21. 现在你练习提取信息

• 从回归中提取预测、斜率和截距，以及训练和测试分数。
``````
import numpy
import matplotlib.pyplot as plt

from ages_net_worths import ageNetWorthData

ages_train, ages_test, net_worths_train, net_worths_test = ageNetWorthData()

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(ages_train, net_worths_train)

### get Katie's net worth (she's 27)
### sklearn's predict() expects a 2-D array of shape (n_samples, n_features),
### so the single age is passed as a list of lists: [[27]]. A flat [27] was
### only tolerated (with a warning) by old sklearn releases and is rejected
### by current ones.
### Predictions are returned in an array, so index into the output with
### [0][0] to get the bare number (this assumes the training targets are
### 2-D, as the original indexing implies).
km_net_worth = reg.predict([[27]])[0][0]

### get the slope
### again, you'll get a 2-D array, so stick the [0][0] at the end
slope = reg.coef_[0][0]

### get the intercept
### here you get a 1-D array, so stick [0] on the end to access
### the info we want
intercept = reg.intercept_[0]

### get the score on test data
test_score = reg.score(ages_test, net_worths_test)

### get the score on the training data
training_score = reg.score(ages_train, net_worths_train)

def submitFit():
    """Return the values extracted from the regression for the grader.

    Reads the module-level names ``km_net_worth``, ``slope``, ``intercept``,
    ``test_score`` and ``training_score`` and packs them into a dictionary.

    Returns:
        dict: the five values under the keys "networth", "slope",
        "intercept", "stats on test" and "stats on training".
    """
    # all of the values in the returned dictionary are expected to be
    # numbers for the purpose of the grader.
    return {
        "networth": km_net_worth,
        "slope": slope,
        "intercept": intercept,
        "stats on test": test_score,
        "stats on training": training_score,
    }
``````
• 输出
``````
{"slope": 6.473549549577059, "stats on training": 0.8745882358217186, "intercept": -14.35378330775552, "stats on test": 0.812365729230847, "networth": 160.43205453082507}
``````

# 25. 最小化误差平方和

• ordinary least squares(OLS) 最小二乘法，sklearn使用该算法进行线性回归

• gradient descent 梯度下降法

# 31. 回归的 R 平方指标

``````
# 计算r平方分数（r-squared score）,最大值1，越大越精确
# Parenthesised single-argument print works the same on Python 2 and 3.
print(reg.score(ages_test, net_worths_test))
``````

# 35. 比较分类与回归

• 监督分类(朴素贝叶斯/SVM/决策树)

• 回归

# 37. 回归迷你项目

``````
#!/usr/bin/python

"""
Starter code for the regression mini-project.

Loads up/formats a modified version of the dataset
(why modified?  we've removed some trouble points
that you'll find yourself in the outliers mini-project).

Draws a little scatterplot of the training/testing data

You fill in the regression code where indicated:
"""

import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# Python 3's pickle needs a binary file object; Python 2 keeps the original
# text mode so existing setups behave exactly as before. The context manager
# closes the handle, which the original one-liner never did.
# NOTE(review): pickle must only be used on trusted files such as this
# course dataset.
pickle_mode = "rb" if sys.version_info[0] >= 3 else "r"
with open("../final_project/final_project_dataset_modified.pkl", pickle_mode) as dataset_file:
    dictionary = pickle.load(dataset_file)

### list the features you want to look at--first item in the
### list will be the "target" feature
features_list = ["bonus", "long_term_incentive"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
# sklearn.cross_validation was superseded by sklearn.model_selection and is
# absent from current releases; fall back to the old module on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.
from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit(feature_train, target_train)
# Each report line is pre-formatted into one string so it prints identically
# under the Python 2 print statement and the Python 3 print function.
print("slope: %s" % (reg.coef_,))
print("intercept: %s" % (reg.intercept_,))
print("score train: %s" % (reg.score(feature_train, target_train),))
print("score test: %s" % (reg.score(feature_test, target_test),))

### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
# Loop variables get their own names so they no longer overwrite the
# module-level `target` list produced by targetFeatureSplit.
for feature_value, target_value in zip(feature_test, target_test):
    plt.scatter(feature_value, target_value, color=test_color)
for feature_value, target_value in zip(feature_train, target_train):
    plt.scatter(feature_value, target_value, color=train_color)

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
# The "train" legend entry is drawn from a training point; the original
# re-plotted the first test point in the training colour.
plt.scatter(feature_train[0], target_train[0], color=train_color, label="train")

### draw the regression line, once it's coded
try:
    plt.plot(feature_test, reg.predict(feature_test))
except NameError:
    # reg is undefined only while the regression above has not been written.
    pass
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()
``````
• 输出结果与图形

``````
slope: [ 5.44814029]
intercept: -102360.543294
score train: 0.0455091926995
score test: -1.48499241737
``````

``````
slope: [ 1.19214699]
intercept: 554478.756215
score train: 0.217085971258
score test: -0.59271289995
``````

# 47. 异常值破坏回归

`reg.fit(feature_test, target_test)`，`plt.plot(feature_train, reg.predict(feature_train), color="b")`

``````
reg.fit(feature_test, target_test)
# Draw the current fit of reg, evaluated on the training features, in yellow.
plt.plot(feature_train, reg.predict(feature_train), color="y")
# One pre-formatted string prints identically under Python 2 and Python 3.
print("test slope: %s" % (reg.coef_,))
``````

• 蓝色斜线，使用训练数据进行回归训练，用测试数据预测
• 黄色斜线，使用测试数据进行回归训练，用训练数据预测
• 其中测试数据上有异常值，所以最终拟合的两条回归线斜率差别很大（用训练数据拟合的斜率约为 5.45，用测试数据拟合的斜率约为 2.27）

``````
train slope: [ 5.44814029]
train intercept: -102360.543294
train score train: 0.0455091926995
train score test: -1.48499241737
#############
test slope: [ 2.27410114]
test intercept: 124444.388866
test score train: -0.123597985403
test score test: 0.251488150398
``````