导入类库
1 from sklearn.externals import joblib 2 from sklearn.model_selection import train_test_split 3 from sklearn.datasets import load_boston 4 from sklearn.preprocessing import StandardScaler 5 from sklearn.linear_model import LinearRegression 6 from sklearn.metrics import r2_score 7 from sklearn import neighbors 8 import pandas as pd 9 import numpy as np 10 import seaborn as sns 11 import matplotlib.pyplot as plt 12 import sklearn.preprocessing as sp 13 import sklearn.pipeline as pl
小知识
# np.column_stack: stacks 1-D arrays side by side, as the columns of a 2-D array
# np.vstack: stacks arrays on top of each other, as the rows of a 2-D array
# a = np.array([1,2])
# b = np.array([3,4])
# print(np.vstack((a,b)))
# array([[1, 2],[3, 4]])
# print(np.column_stack((a,b)))
# array([[1, 3],[2, 4]])
获取波士顿房价数据
# Load the Boston housing dataset.
lb = load_boston()

# Build a DataFrame in which every row holds the 13 feature values followed
# by the house price (MEDV): start from the feature matrix, then append the
# target as the last column.
df = pd.DataFrame(
    lb.data,
    columns=[
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT',
    ],
)
df['MEDV'] = lb.target

# Columns used by the plotting helpers in this file.
cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
print(df)
print(df[cols])
多变量图
def pairplot_analyse():
    """Draw a pairwise-relationship grid for the columns listed in ``cols``.

    Seaborn is first configured with the 'dark' style and the 'notebook'
    context. ``sns.pairplot`` is then called on ``df[cols]`` with:

    - ``height=2``: height of each facet
    - ``palette='husl'``: colour palette
    - ``kind='reg'``: regression plots in the off-diagonal cells
    - ``diag_kind='kde'``: kind of plot drawn on the diagonal
    - ``markers='+'``: shape of the point markers

    :return: None; the figure is displayed with ``plt.show()``.
    """
    sns.set(style='dark', context='notebook')

    pairplot_options = dict(
        height=2,
        palette='husl',
        kind='reg',
        diag_kind='kde',
        markers='+',
    )
    sns.pairplot(df[cols], **pairplot_options)

    plt.tight_layout()
    plt.show()
热力图
def heatmap_analyse():
    """Show a heat map of the Pearson correlations between the ``cols`` columns.

    Heat-map options passed to ``sns.heatmap``:

    - ``cbar``: draw the colour bar
    - ``annot``: write the value inside each cell
    - ``square``: make every cell square
    - ``fmt``: number format of the annotations
    - ``annot_kws``: extra text options for the annotations (font size here)
    - ``xticklabels`` / ``yticklabels``: axis tick labels

    :return: None; the figure is displayed with ``plt.show()``.
    """
    # Pearson correlation matrix; rowvar=False tells NumPy that each column
    # (not each row) of the data is one variable.
    corr = np.corrcoef(df[cols].values, rowvar=False)

    sns.heatmap(
        corr,
        cbar=True,
        annot=True,
        square=True,
        fmt='.2f',
        annot_kws={'size': 15},
        yticklabels=cols,
        xticklabels=cols,
    )
    plt.show()
回归是一种对数值型连续随机变量进行预测和建模的监督学习方法,典型应用包括房价预测、股票走势预测、考试成绩预测等目标值连续变化的场景。
回归任务的特点是标注的数据集具有数值型的目标变量。也就是说,每一个观察样本都有一个数值型的标注真值以监督算法。
线性回归
def bostn_linear():
    """Fit a linear regression on the Boston housing data and predict one house.

    The data is split into train and test sets, features and target are
    standardised, a ``LinearRegression`` model is fitted and saved to
    ``./test.pkl``, and finally the true and the predicted price of the test
    sample at index 3 are printed in the original (unscaled) units.

    :return: None
    """
    housing = load_boston()

    # Hold out 25% of the samples as the test set.
    x_train, x_test, y_train, y_test = train_test_split(
        housing.data, housing.target, test_size=0.25
    )

    # Standardise the features: the scaler is fitted on the training set only
    # and the same transformation is then applied to the test set.
    feature_scaler = StandardScaler()
    x_train = feature_scaler.fit_transform(x_train)
    x_test = feature_scaler.transform(x_test)

    # Standardise the target the same way; reshape(-1, 1) turns the 1-D target
    # into a single column (-1 lets NumPy infer the number of rows).
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train.reshape(-1, 1))
    y_test = target_scaler.transform(y_test.reshape(-1, 1))

    # Create and train the linear regression estimator.
    model = LinearRegression()
    model.fit(x_train, y_train)
    print(model.coef_)  # learned weights

    # Persist the trained model.
    joblib.dump(model, './test.pkl')

    # Compare the true and the predicted price of a single test sample,
    # both converted back to the original scale.
    sample_index = 3
    orgin = target_scaler.inverse_transform(y_test[sample_index])
    print('orgin value is:::::', orgin)

    # predict() expects a 2-D array, so wrap the single sample in a list.
    sample_features = np.array([x_test[sample_index]])
    y_lr_predict = target_scaler.inverse_transform(model.predict(sample_features))

    print('正规方程测试集里面每个房子的预测价格:', y_lr_predict)
原图像与预测图像对比
def log_fit():
    """Fit a straight line to noisy cubic data and plot both for comparison.

    50 points ``y = x**3 + noise`` are generated for ``x`` in [0, 20], a
    ``LinearRegression`` model is fitted on them, the fitted values are
    printed, and the original points are plotted together with the fitted
    values (in red).

    :return: None; the figure is displayed with ``plt.show()``.
    """
    x = np.linspace(0, 20, 50)
    noise = np.random.random(50, ) * 100
    y = x ** 3 + noise

    # The estimator expects a 2-D feature matrix: one column, one row per point.
    features = x.reshape(-1, 1)

    model = LinearRegression()
    model.fit(features, y)
    fitted = model.predict(features)
    print(fitted)

    plt.scatter(x, y)  # original (curved) data
    plt.scatter(x, fitted, c='r')  # fitted values (straight line)
    plt.show()
房价预测实例
def test_fj():
    """Train and score a linear house-price model on a tiny hand-made dataset.

    The five samples are split into train and test sets, features and prices
    are standardised, a ``LinearRegression`` model is fitted, and the
    predicted test prices (in original units) and their R2 score are printed.

    :return: None
    """
    X = np.array(
        [[500, 3, 0.3], [1000, 1, 0.6], [750, 2, 0.3], [600, 5, 0.2], [1200, 1, 0.6]],
        dtype=float,
    )
    Y = np.array([10000, 9000, 8000, 12000, 8500], dtype=float)

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)
    print(x_train, x_test)
    print('===================================================')
    print(y_train, y_test)

    # Scale the features; the scaler is fitted on the training set only.
    feature_scaler = StandardScaler()
    x_train = feature_scaler.fit_transform(x_train)
    x_test = feature_scaler.transform(x_test)

    # Scale the prices; the scaler needs a single-column 2-D array.
    price_scaler = StandardScaler()
    y_train = price_scaler.fit_transform(y_train.reshape(-1, 1))
    y_test = price_scaler.transform(y_test.reshape(-1, 1))

    model = LinearRegression()
    model.fit(x_train, y_train)
    print(model.coef_)

    # Convert the predictions back to the original price scale.
    y_lr_predict = price_scaler.inverse_transform(model.predict(x_test))
    print('房价:', y_lr_predict)

    true_prices = price_scaler.inverse_transform(y_test)
    print('评分:', r2_score(true_prices, y_lr_predict))


def price_predict():
    """Predict the price of a new house from values typed in by the user.

    A ``LinearRegression`` model is trained on five samples with three
    features each (distance to the subway, number of nearby primary schools,
    greening rate of the residential area). The user is then prompted for the
    three feature values of a new house and the predicted price is printed in
    the original units.

    :return: None
    """
    # Three features per house: subway distance, nearby schools, greening rate.
    X = np.array(
        [[500, 3, 0.3], [1000, 1, 0.6], [750, 2, 0.3], [600, 5, 0.2], [1200, 1, 0.6]],
        dtype=float,
    )
    # Price of each of the houses above.
    Y = np.array([10000, 9000, 8000, 12000, 8500], dtype=float)

    feature_scaler = StandardScaler()
    x_train = feature_scaler.fit_transform(X)

    # reshape(-1, 1) makes the prices a single-column 2-D array, which is
    # what the scaler works on.
    price_scaler = StandardScaler()
    y_train = price_scaler.fit_transform(Y.reshape(-1, 1))

    # Build the linear model and train it on the historical data.
    model = LinearRegression()
    model.fit(x_train, y_train)

    # Ask for the features of the new house, in the same order as in X.
    prompts = ('请输入新房屋距离地铁的距离:', '请输入附近小学数量:', '请输入小区绿化率:')
    answers = [input(prompt) for prompt in prompts]

    # dtype=float converts the typed strings to numbers.
    x_predict = feature_scaler.transform(np.array([answers], dtype=float))
    print(price_scaler.inverse_transform(model.predict(x_predict)))


if __name__ == '__main__':
    # Run one demo at a time: uncomment the call you want to try.
    pairplot_analyse()
    # heatmap_analyse()
    # bostn_linear()
    # log_fit()
    # test_fj()
    # price_predict()
线性回归的几个特点:
1. 建模速度快,不需很复杂的计算,数据量大的情况下依然运行速度很快;
2. 可以根据系数给出每个变量的理解和解释;
3. 对异常值敏感。