import numpy as np
import pandas as pd
import sklearn
import matplotlib as mlp
import seaborn as sns
#import re, pip, conda


"""
for package in [sklearn,mlp,np,pd,sns,pip,conda]:
    print(re.findall("([^']*)",str(package))[2],package.__version__)
"""

'\nfor package in [sklearn,mlp,np,pd,sns,pip,conda]:\n    print(re.findall("([^\']*)",str(package))[2],package.__version__)\n'


#pip install --upgrade scikit-learn
#conda update scikit-learn


# Classification case: class labels (0, 1 or 2) predicted by 7 weak estimators
r_clf = np.asarray([0, 2, 1, 1, 2, 1, 0])


# Majority vote: count how often each label occurs, then pick the label with
# the largest count (argmax returns the first such label when counts tie)
b_result_clf = np.bincount(r_clf).argmax()


b_result_clf # 返回少数服从多数，集成算法在现在的样本上应该输出的类别

1


np.bincount(r_clf) # 0类2个，1类3个，2类2个

array([2, 3, 2], dtype=int64)


np.bincount([3,0,2,1,1,2,1,0]) #0类2个，1类3个，2类2个，3类1个

array([2, 3, 2, 1], dtype=int64)


np.argmax(np.array([2, 3, 2]))
#不难发现，其返回的就是少数服从多数后数量最多的类别

1


#如果是二分类，涉及到有一些负数类别的，可以使用如下代码
r_clf = np.array([1,1,1,-1,-1,-1,-1])


(r_clf == 1).sum() #整个集成算法当中，输出为1的弱分类器的数量

3


(r_clf == -1).sum()

4


# Binary vote with labels 1 / -1: predict 1 only when the votes for 1 strictly
# outnumber all other votes; otherwise (a tie included) predict -1
b_result_clf = -1 if (r_clf != 1).sum() >= (r_clf == 1).sum() else 1


b_result_clf

-1


#如果评估器的数量是偶数，而少数和多数刚好一致怎么办？
r_clf = np.array([1,1,1,0,0,0,2,2])


# Regression case: predictions produced by 7 weak estimators
r_reg = np.asarray([-2.082, -0.601, -1.686, -1.001, -2.037, 0.1284, 0.8500])


# The ensemble output is the plain average of the individual predictions
b_result_reg = np.mean(r_reg)


b_result_reg

-0.9183714285714285


import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor as RFR # 随机森林回归
from sklearn.tree import DecisionTreeRegressor as DTR     # 决策回归树
from sklearn.model_selection import cross_validate, KFold # 交叉验证类

#这里我们不再使用cross_val_score，转而使用能够输出训练集分数的cross_validate
#决策树本身就是非常容易过拟合的算法，而集成模型的参数量/复杂度很难支持大规模网格搜索
#因此对于随机森林来说，一定要关注算法的过拟合情况


# Load the encoded house-price training set, using the first CSV column as the
# row index. The relative path is written with forward slashes so it resolves
# on Windows as well as on POSIX systems (a backslash is a path separator on
# Windows only).
data = pd.read_csv("./datasets/House Price/train_encode.csv",index_col=0)


data.head()


data.shape

(1460, 81)


# Feature matrix: every column except the last one
X = data.iloc[:,:-1]
# Target vector: the last column
y = data.iloc[:,-1]


y #注意，y的类型是整数型，并且y的均值很大，可想而知整体的MSE一定会很大

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64


y.mean()

180921.19589041095


X.shape

(1460, 80)


X.columns.tolist()

['Id',
 '住宅类型',
 '住宅区域',
 '街道接触面积(英尺)',
 '住宅面积',
 '街道路面状况',
 '巷子路面状况',
 '住宅形状(大概)',
 '住宅现状',
 '水电气',
 '住宅配置',
 '住宅视野',
 '社区',
 '住宅周边1',
 '住宅周边2',
 '适用家庭',
 '住宅房型',
 '装修质量',
 '整体质量',
 '建造年份',
 '法律拆除年份',
 '天花板类型',
 '天花板材料',
 '户外装饰1',
 '户外装饰2',
 '砖墙类型',
 '砖墙面积',
 '户外材料质量',
 '户外装修质量',
 '地下室类型',
 '地下室深度',
 '地下室质量',
 '花园外墙',
 '地下室现状1',
 '地下室一层标准面积',
 '地下室现状2',
 '地下室二层标准面积',
 '地下室建造现状',
 '整体地下室面积',
 '暖气类型',
 '暖气质量',
 '中央空调',
 '电力系统',
 '二楼面积',
 '三楼面积',
 '全低质量面积',
 '户外活动空间面积',
 '全卫地下室',
 '半卫地下室',
 '全卫及以上',
 '半卫及以上',
 '卧室及以上',
 '厨房及以上',
 '厨房质量',
 '总房间量',
 '住宅性能',
 '壁炉数量',
 '壁炉质量',
 '车库类型',
 '车库建造时间',
 '车库装修现状',
 '车位数量',
 '车库面积',
 '车库质量',
 '车库现状',
 '石板路',
 '木板面积',
 '开放式门廊面积',
 '关闭式门廊面积',
 '三季门廊面积',
 '半开放式门廊面积',
 '泳池面积',
 '泳池质量',
 '篱笆质量',
 '其他配置',
 '其他配置的价值',
 '销售月份',
 '销售年份',
 '销售类型',
 '销售状态']


reg_f = RFR() # random forest regressor with default hyperparameters
reg_t = DTR() # single decision tree regressor with default hyperparameters
cv = KFold(n_splits=5,shuffle=True,random_state=1412) # 5-fold CV splitter, shuffled with a fixed seed


# Cross-validate the single decision tree, keeping the training-fold scores too
result_t = cross_validate(reg_t                             # estimator to cross-validate
                          ,X,y                              # data
                          ,cv=cv                            # cross-validation splitter
                          ,scoring="neg_mean_squared_error" # evaluation metric (negated MSE)
                          ,return_train_score=True          # also return scores on the training folds
                          ,verbose=True                     # print progress
                          ,n_jobs=-1                        # number of parallel jobs; -1 uses all available workers
                         )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.9s remaining:   15.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.0s finished


# Cross-validate the random forest with the same splitter and metric as the
# decision tree, so that the two result dicts are directly comparable
result_f = cross_validate(reg_f,X,y,cv=cv,scoring="neg_mean_squared_error"
                          ,return_train_score=True
                          ,verbose=True
                          ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.1s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.9s finished


result_t # 决策树上的交叉验证结果，超级过拟合；fit_time：拟合时间

{'fit_time': array([0.03649879, 0.04089069, 0.0289197 , 0.02692771, 0.0274806 ]),
 'score_time': array([0.00199342, 0.00200152, 0.00099802, 0.00199318, 0.00301957]),
 'test_score': array([-1.28805154e+09, -2.80215215e+09, -1.72389216e+09, -1.57753237e+09,
        -1.51032332e+09]),
 'train_score': array([-0., -0., -0., -0., -0.])}


result_f # 随机森林的交叉验证结果，训练集和测试在交叉验证上的分数差异更小，因此森林的过拟合程度没有决策树高

{'fit_time': array([2.00447917, 1.93865466, 1.97705007, 2.07168698, 2.06669641]),
 'score_time': array([0.01898646, 0.01994991, 0.01196265, 0.02293968, 0.02393651]),
 'test_score': array([-7.70912239e+08, -1.99743670e+09, -7.82037295e+08, -4.42877134e+08,
        -9.03940635e+08]),
 'train_score': array([-1.15456318e+08, -1.00117489e+08, -1.26789154e+08, -1.31821824e+08,
        -1.09692763e+08])}


# The scores are negated MSE values: take the absolute value to recover the
# MSE, then the square root to obtain one RMSE per fold
trainRMSE_f = abs(result_f["train_score"])**0.5
testRMSE_f = abs(result_f["test_score"])**0.5
trainRMSE_t = abs(result_t["train_score"])**0.5
testRMSE_t = abs(result_t["test_score"])**0.5


trainRMSE_f.mean()

10793.160353127281


testRMSE_f.mean()

30306.635937508057


trainRMSE_f.std() #方差数额太大，使用标准差

532.1647041275475


# Cross-validation plot: per-fold RMSE of the default random forest and the
# default decision tree, on the training folds (solid) and test folds (dashed)
xaxis = range(1,6)
plt.figure(figsize=(8,6),dpi=80)
curves = (
    (trainRMSE_f, dict(color="green", label="RandomForestTrain")),
    (testRMSE_f, dict(color="green", linestyle="--", label="RandomForestTest")),
    (trainRMSE_t, dict(color="orange", label="DecisionTreeTrain")),
    (testRMSE_t, dict(color="orange", linestyle="--", label="DecisionTreeTest")),
)
for fold_rmse, line_kwargs in curves:
    plt.plot(xaxis, fold_rmse, **line_kwargs)
plt.xticks(list(xaxis))
plt.xlabel("CVcounts", fontsize=16)
plt.ylabel("RMSE", fontsize=16)
plt.legend()
plt.show()


# 自定义一个RMSE函数
def RMSE(cvresult,key):
    """Return the mean RMSE over the folds stored under ``cvresult[key]``.

    ``cvresult[key]`` is expected to hold (negated) MSE scores as a numeric
    array: the absolute value recovers the MSE, the square root turns each
    entry into an RMSE, and the entries are then averaged.
    """
    mse_per_fold = abs(cvresult[key])
    rmse_per_fold = mse_per_fold ** 0.5
    return rmse_per_fold.mean()


reg_f = RFR(n_estimators=3) # random forest with 3 trees
cv = KFold(n_splits=5,shuffle=True,random_state=1412) # 5-fold cross-validation
# Negated-MSE scores on both the training and the test folds
result_f = cross_validate(reg_f,X,y,cv=cv,scoring="neg_mean_squared_error"
                          ,return_train_score=True
                          ,verbose=True
                          ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


RMSE(result_f,"test_score")

36918.85596354526


reg_f = RFR(n_estimators=100) # random forest with 100 trees
cv = KFold(n_splits=5,shuffle=True,random_state=1412) # same 5-fold splitter and seed as before
# Negated-MSE scores on both the training and the test folds
result_f = cross_validate(reg_f,X,y,cv=cv,scoring="neg_mean_squared_error"
                          ,return_train_score=True
                          ,verbose=True
                          ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.3s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


RMSE(result_f,"test_score")

30376.08368636785


reg_f = RFR(n_estimators=500) # random forest with 500 trees
cv = KFold(n_splits=5,shuffle=True,random_state=1412) # same 5-fold splitter and seed as before
# Negated-MSE scores on both the training and the test folds
result_f = cross_validate(reg_f,X,y,cv=cv,scoring="neg_mean_squared_error"
                          ,return_train_score=True
                          ,verbose=True
                          ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   11.8s remaining:   17.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.9s finished


RMSE(result_f,"test_score")

30323.956966236037


reg = RFR(n_estimators=20
          , bootstrap=True  # draw a bootstrap sample for every tree
          , oob_score=True  # evaluate the forest on the out-of-bag samples
          , max_samples=500 # each bootstrap sample contains 500 rows
         ).fit(X,y)


# Important attribute: oob_score_
reg.oob_score_ # R^2 on the out-of-bag data (about 0.84 in the recorded run)

0.8406980885927464


# Deliberately invalid configuration: fit() raises ValueError. In the recorded
# run the error message is about `max_samples` being set while bootstrap=False;
# out-of-bag evaluation likewise presupposes bootstrap sampling.
reg = RFR(n_estimators=20
          , bootstrap=False
          , oob_score=True
          , max_samples=500).fit(X,y) # raises ValueError (see recorded traceback)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_11668\3415431267.py in <module>
      2           , bootstrap=False
      3           , oob_score=True
----> 4           , max_samples=500).fit(X,y) # 直接无法运行，因为要抽取袋外数据的前提是随机抽样

~\anaconda3\envs\kaggle\lib\site-packages\sklearn\ensemble\_forest.py in fit(self, X, y, sample_weight)
    378         if not self.bootstrap and self.max_samples is not None:
    379             raise ValueError(
--> 380                 "`max_sample` cannot be set if `bootstrap=False`. "
    381                 "Either switch to `bootstrap=True` or set "
    382                 "`max_sample=None`."

ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.


reg = RFR(n_estimators=20
          , bootstrap=True  # bootstrap sampling is enabled
          , oob_score=False # but the out-of-bag score is not computed
          , max_samples=500).fit(X,y)


reg.oob_score_ # training succeeds, but the attribute does not exist: raises AttributeError

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_11668\372648889.py in <module>
----> 1 reg.oob_score_ # 虽然可以训练，但oob_score无法被调用

AttributeError: 'RandomForestRegressor' object has no attribute 'oob_score_'


# Compare how sqrt(n) and log2(n) grow with n, for odd n from 1 to 99.
# The x values are defined once and reused for both the curves and the plot.
xaxis = range(1,101,2)
sqrt_ = [np.sqrt(n_features) for n_features in xaxis]
log_ = [np.log2(n_features) for n_features in xaxis]
plt.figure(figsize=(8,6),dpi=80)
# One curve per function
plt.plot(xaxis,sqrt_,color="green",label = "sqrt(n)")
plt.plot(xaxis,log_,color="orange",label = "log2(n)")
plt.xticks(range(1,101,10))
plt.legend()
plt.show()


### 随机模式说明

random_state是一个较为抽象的参数，对于刚接触sklearn运作方式的人们来说，random_state可能会比较难以理解。

在日常生活中，当我们讨论“随机”时，我们指的是**真正的随机**：
- 不可预测
- 不可有目的地重复
- 实验之间完全相互独立

~~比如游戏当中抽卡~~，比如抛硬币，我们无法预料会抛出正面还是反面，并且抛出一次正面之后，无法有目的地再次抛出正面；无论上一次是正面还是反面，都不影响下一次抛硬币的结果。**真正的随机一定伴随着物理实验，是只有自然界才有的现象**。

在计算机的世界中，任意的“随机”一定是通过某种计算方式得到和实现的，这种随机是“伪随机”。无论是随机抽样，还是随机打乱数据顺序等等，其实背后都有着计算机的规则在进行控制，**只要我们找到背后的规则，我们就可以有目的地重复随机的结果，而random_state就是在随机过程当中，控制随机规则的参数**。

例如，现在我们需要在[1,2,3,4,5]中随机抽取3个数字，我们可以有多种规则：
<br><br>
0号规则：4,5,1<br>
1号规则：2,1,5<br>
2号规则：1,5,4<br>
3号规则：2,5,4<br>
....

list_ = [1,2,3,4,5]

import random
random.sample(list_,k=3) #随机从列表中抽取样本，抽取3个

根据排列组合的规则，总共有$A_5^3$=60种选择。每当我们执行抽样的代码时，计算机会在所有规则中选择一个返回给我们。当我们多次执行抽样代码，计算机会返回不同的结果，营造一种“随机”的氛围，但并非真正的在列表中进行了“不可预测、不可重复”的抽样。

**只要我们设置随机数种子，我们就可以重复抽样的结果，令本来应该“随机”的过程变成固定过程**。

#random.seed是random模块中的随机数种子，等同于sklearn中的random_state
random.seed(0) #0号规则
random.sample(list_,k=3)

random.seed(2)
random.sample(list_,k=3)

即便只有60种选择，我们的随机数种子也可以设置为任意的数字。当然，无论我们设置怎样的数字，最终计算机都会从这60种选择中挑选一个返回给我们，这也是计算机的规则决定的。无论我们输入了什么数字，只要我们认为当下随机操作返回的结果是可以接受的，就可以持续使用我们设置的数字。

random.seed(1412)
random.sample(list_,k=3)

random.seed(2333)
random.sample(list_,k=3)

  File "C:\Users\zhiyuan\AppData\Local\Temp\ipykernel_11668\568528997.py", line 3
    random_state是一个较为抽象的参数，对于刚接触sklearn运作方式的人们来说，random_state可能会比较难以理解。
                                                                      ^
SyntaxError: invalid character in identifier


import pandas as pd
import random

list_ = [1,2,3,4,5]
list_p = pd.Series(list_)

list_p

0    1
1    2
2    3
3    4
4    5
dtype: int64


#random中的随机抽样
random.seed(1)
random.sample(list_,k=3)

[2, 1, 5]


#pandas中的随机抽样
list_p.sample(n=3,random_state=1).values

array([3, 2, 5], dtype=int64)


# Candidate values for n_estimators. Only one parameter is tuned here, so the
# range can be wide and the grid fairly dense.
Option = [1] + list(range(5,101,5))


# Per-candidate summaries of the cross-validated RMSE, gathered in plain lists
# and converted to arrays once the loop is done
train_means, test_means, train_stds, test_stds = [], [], [], []

for n_estimators in Option:

    # Model for the current candidate value
    reg_f = RFR(n_estimators=n_estimators, random_state=1412)

    # 5-fold cross-validation returning negated MSE on train and test folds
    cv = KFold(n_splits=5, shuffle=True, random_state=1412)
    result_f = cross_validate(
        reg_f, X, y,
        cv=cv,
        scoring="neg_mean_squared_error",
        return_train_score=True,
        n_jobs=-1,
    )

    # Negated MSE -> RMSE, one value per fold
    train = abs(result_f["train_score"])**0.5
    test = abs(result_f["test_score"])**0.5

    # Mean RMSE: lower means a better fit.
    # Standard deviation across folds: lower means a more stable model.
    train_means.append(train.mean())
    test_means.append(test.mean())
    train_stds.append(train.std())
    test_stds.append(test.std())

trainRMSE = np.array(train_means)
testRMSE = np.array(test_means)
trainSTD = np.array(train_stds)
testSTD = np.array(test_stds)


def plotCVresult(Option,trainRMSE,testRMSE,trainSTD,testSTD):
    """Plot mean RMSE with a +/- one standard deviation band for one CV sweep.

    Option is used as the x axis (and as the tick positions). The train curves
    are drawn in black and the test curves in red; the dotted lines are the
    mean plus and minus the corresponding standard deviation.
    """
    plt.figure(figsize=(8,6),dpi=80)

    curves = (
        (trainRMSE, trainSTD, "k", "RandomForestTrain"),
        (testRMSE, testSTD, "red", "RandomForestTest"),
    )

    # Mean RMSE curves (the only labelled lines, so the only legend entries)
    for centre, _, colour, name in curves:
        plt.plot(Option, centre, color=colour, label=name)

    # Standard-deviation band around each mean curve
    for centre, spread, colour, _ in curves:
        for edge in (centre + spread, centre - spread):
            plt.plot(Option, edge, color=colour, linestyle="dotted")

    plt.xticks(list(Option))
    plt.legend(loc=1)
    plt.show()


plotCVresult(Option,trainRMSE,testRMSE,trainSTD,testSTD)


# Random forest with 10 trees and a fixed seed
reg_f = RFR(n_estimators=10,random_state=1412)
reg_f = reg_f.fit(X,y) # train the forest on the full data set


reg_f.estimators_ # 一片随机森林中所有的树

[DecisionTreeRegressor(max_features='auto', random_state=1630984966),
 DecisionTreeRegressor(max_features='auto', random_state=472863509),
 DecisionTreeRegressor(max_features='auto', random_state=1082704530),
 DecisionTreeRegressor(max_features='auto', random_state=1930362544),
 DecisionTreeRegressor(max_features='auto', random_state=273973624),
 DecisionTreeRegressor(max_features='auto', random_state=21991934),
 DecisionTreeRegressor(max_features='auto', random_state=1886585710),
 DecisionTreeRegressor(max_features='auto', random_state=63725675),
 DecisionTreeRegressor(max_features='auto', random_state=1374343434),
 DecisionTreeRegressor(max_features='auto', random_state=1078007175)]


# 可以用索引单独提取一棵树
reg_f.estimators_[0]

DecisionTreeRegressor(max_features='auto', random_state=1630984966)


# 调用这棵树的底层结构
reg_f.estimators_[0].tree_

<sklearn.tree._tree.Tree at 0x2338d8ca340>


reg_f.estimators_[0].tree_.max_depth # max_depth=None

19


# Print the actual depth reached by every tree in the forest
for t in reg_f.estimators_:
    print(t.tree_.max_depth)

19
25
27
20
23
22
22
20
22
24


# With many trees, look at the mean / distribution of the depths instead of
# printing every single value
reg_f = RFR(n_estimators=100,random_state=1412)
reg_f = reg_f.fit(X,y) # train the forest
# Build the Series in one step rather than enlarging it element by element
d = pd.Series([t.tree_.max_depth for t in reg_f.estimators_],dtype="int64")


d.mean()

22.25


d.describe()

count    100.000000
mean      22.250000
std        1.955954
min       19.000000
25%       21.000000
50%       22.000000
75%       23.000000
max       30.000000
dtype: float64


# Total number of nodes (internal nodes + leaves) in one tree
reg_f.estimators_[0].tree_.node_count

1807


# Total number of nodes (internal nodes + leaves) of every tree in the forest
for t in reg_f.estimators_:
    print(t.tree_.node_count)

1807
1777
1763
1821
1777
1781
1811
1771
1753
1779
1765
1769
1799
1755
1745
1781
1733
1733
1775
1775
1791
1743
1777
1759
1779
1765
1801
1791
1755
1779
1803
1795
1795
1811
1763
1761
1733
1759
1745
1791
1789
1791
1767
1813
1753
1765
1793
1729
1827
1803
1741
1779
1737
1791
1773
1787
1749
1779
1759
1801
1775
1787
1751
1779
1743
1751
1727
1773
1825
1747
1763
1801
1803
1795
1755
1737
1799
1735
1775
1781
1755
1801
1811
1767
1787
1771
1775
1755
1753
1801
1775
1817
1765
1773
1715
1757
1751
1791
1773
1769


# Split threshold stored for every node (the feature value the node splits
# on, not an impurity decrease); leaf nodes show up as -2 in the recorded output
reg_f.estimators_[0].tree_.threshold

array([  6.5,   5.5, 327. , ...,  -2. ,  -2. ,  -2. ])


# First 20 per-node split thresholds (feature values, not impurity
# decreases); entries equal to -2 correspond to leaf nodes
reg_f.estimators_[0].tree_.threshold.tolist()[:20]

[6.5,
 5.5,
 327.0,
 214.0,
 0.5,
 1.0,
 104.0,
 0.5,
 -2.0,
 -2.0,
 -2.0,
 105.5,
 28.5,
 0.5,
 1.5,
 -2.0,
 -2.0,
 11.0,
 1212.5,
 2.5]


# Motivation: how do we know which range of min_impurity_decrease prunes how
# many nodes?
# Count how many nodes share each distinct value of `threshold`.
# NOTE(review): `threshold` holds the split threshold of each node (a feature
# value), not the impurity decrease of the split, so these counts do not map
# directly onto min_impurity_decrease -- confirm the intended attribute.
pd.Series(reg_f.estimators_[0].tree_.threshold).value_counts().sort_index()

-2.0       904
 0.5        43
 1.0        32
 1.5        56
 2.0        32
          ... 
 1118.5      1
 1162.5      1
 1212.5      2
 1254.5      1
 1335.5      1
Length: 413, dtype: int64


pd.set_option("display.max_rows",None) # show every row instead of truncating the output
# Running total of the node counts, ordered by threshold value
np.cumsum(pd.Series(reg_f.estimators_[0].tree_.threshold).value_counts().sort_index()[1:]) # [1:] drops the first entry, i.e. the -2.0 bucket holding the leaf nodes

1.0        32
1.5        88
2.0       120
2.5       167
3.0       189
3.5       208
4.0       224
4.5       249
5.0       258
5.5       271
6.0       276
6.5       287
7.0       302
7.5       307
8.0       313
8.5       321
9.0       326
9.5       334
10.0      335
10.5      343
11.0      346
11.5      348
12.0      349
12.5      353
13.0      355
13.5      359
14.0      360
14.5      361
15.5      364
16.0      366
17.5      368
18.0      370
18.5      372
19.0      374
19.5      377
20.0      378
20.5      379
21.0      382
22.0      385
22.5      387
23.0      389
24.0      391
24.5      392
26.0      394
26.5      396
27.0      398
27.5      399
28.5      400
29.5      401
30.0      402
31.0      404
31.5      405
32.0      407
32.5      408
33.0      409
34.0      410
34.5      411
35.0      413
35.5      416
36.0      417
36.5      419
37.0      420
37.5      422
38.0      424
39.5      427
40.0      428
40.5      429
41.0      433
42.0      435
43.0      436
43.5      439
44.0      440
44.5      443
45.0      444
45.5      450
46.0      452
46.5      454
47.0      456
48.0      459
48.5      463
49.0      465
49.5      467
50.0      470
50.5      471
51.0      474
51.5      477
52.5      478
53.0      479
53.5      481
54.0      482
54.5      483
55.0      486
55.5      487
56.0      489
56.5      493
57.0      495
58.0      497
58.5      500
59.5      501
60.0      503
60.5      505
61.0      509
62.5      511
63.0      513
63.5      515
64.5      516
65.0      517
65.5      518
66.5      519
67.0      522
67.5      523
68.5      525
69.0      526
69.5      527
71.0      528
72.0      529
73.0      530
74.0      531
75.0      532
75.5      533
76.0      534
77.0      536
78.5      537
79.5      539
81.5      540
82.5      542
84.0      543
84.5      545
85.5      546
87.0      548
87.5      549
88.0      550
88.5      551
90.0      552
90.5      553
91.0      555
91.5      556
92.0      558
92.5      560
94.5      561
95.0      562
95.5      564
96.0      566
96.5      567
97.0      569
98.0      570
99.5      571
102.0     572
103.5     573
104.0     574
104.5     575
105.5     577
106.5     579
107.0     580
107.5     581
108.0     583
110.0     584
110.5     585
112.0     586
113.0     587
114.5     588
116.5     589
117.5     590
118.0     591
120.5     592
122.0     593
124.5     594
127.0     596
127.5     597
128.5     598
129.5     599
130.5     600
134.0     601
135.5     602
136.5     603
137.0     604
139.0     605
139.5     606
142.0     607
143.0     608
146.0     609
146.5     610
147.0     611
148.0     612
148.5     613
149.0     614
150.5     616
151.5     617
153.0     618
160.5     619
162.0     620
163.0     621
164.5     623
170.5     624
171.0     625
172.5     626
175.0     627
176.5     628
177.0     629
178.0     630
179.0     632
179.5     633
188.0     634
190.5     635
191.5     636
192.5     638
194.5     639
195.0     640
198.5     641
199.0     642
201.0     644
202.5     645
204.0     647
207.0     648
209.0     649
209.5     650
210.0     651
210.5     652
214.0     653
215.0     654
216.5     655
217.0     656
218.5     657
223.0     658
227.5     659
228.5     660
232.0     661
233.5     662
237.0     664
238.0     665
239.5     666
242.0     667
242.5     668
243.5     669
246.0     671
248.5     672
250.0     673
251.5     674
252.0     675
256.0     677
259.5     678
260.5     679
263.0     680
264.0     681
265.5     682
268.5     684
278.0     685
281.5     686
284.0     687
285.0     688
288.5     689
289.5     691
290.5     692
291.0     693
293.0     694
295.0     695
296.5     696
297.0     697
298.0     698
299.0     699
300.0     701
301.5     702
302.0     703
303.0     704
303.5     705
305.0     706
305.5     707
306.0     708
306.5     709
319.5     711
323.5     712
327.0     713
327.5     715
328.0     717
334.5     718
338.5     719
344.0     721
345.0     722
346.0     723
347.0     724
350.5     725
351.0     726
354.0     727
357.0     728
364.5     729
365.5     730
368.0     731
369.5     732
374.5     733
387.0     734
392.5     735
395.0     736
395.5     737
397.0     738
404.0     739
406.5     740
411.0     741
412.0     742
412.5     743
413.5     744
416.0     745
417.5     746
420.0     747
420.5     748
432.0     749
446.5     750
448.5     751
452.0     752
455.0     753
473.5     754
474.5     755
475.0     756
481.0     757
485.0     758
487.5     759
488.5     760
490.0     761
492.5     762
493.0     763
493.5     764
500.5     765
501.0     766
501.5     767
505.0     768
510.0     769
517.5     770
520.0     771
522.0     772
527.5     775
531.5     776
532.0     777
535.5     778
540.0     780
542.5     781
543.5     782
547.5     783
550.5     784
553.5     785
556.0     786
556.5     787
559.5     788
560.0     789
563.0     790
574.5     791
577.0     792
579.0     793
580.0     794
582.0     795
582.5     796
587.0     797
591.0     798
593.0     799
594.5     800
596.5     801
609.0     802
610.0     803
611.5     804
615.0     805
619.0     806
630.5     807
631.0     808
647.0     810
655.5     811
656.5     812
661.5     813
662.0     814
668.5     815
673.5     816
680.0     817
681.5     818
690.5     819
695.5     820
697.5     821
705.5     822
717.0     823
726.5     824
737.5     825
739.5     826
759.5     827
760.5     828
767.5     829
769.0     830
794.5     831
807.0     832
810.5     833
819.5     834
820.5     835
843.0     836
847.0     837
865.0     838
874.0     839
877.5     840
894.5     841
903.0     842
917.5     843
918.0     844
924.0     845
954.0     846
961.5     847
1013.5    848
1015.5    849
1024.5    850
1043.5    851
1052.5    852
1059.5    853
1070.0    854
1118.5    855
1162.5    856
1212.5    858
1254.5    859
1335.5    860
dtype: int64


# Motivation: which range of min_samples_split would prune many nodes?
# Histogram of the number of training samples per node; the first 10 entries
# are the numbers of nodes holding 0, 1, ..., 9 samples
np.bincount(reg_f.estimators_[0].tree_.n_node_samples.tolist())[:10]

array([  0, 879, 321, 154,  86,  52,  42,  38,  29,  18], dtype=int64)


from sklearn.tree._tree import Tree


type(Tree)

type


help(Tree)

Help on class Tree in module sklearn.tree._tree:

class Tree(builtins.object)
 |  Array-based representation of a binary decision tree.
 |  
 |  The binary tree is represented as a number of parallel arrays. The i-th
 |  element of each array holds information about the node `i`. Node 0 is the
 |  tree's root. You can find a detailed description of all arrays in
 |  `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split
 |  nodes, resp. In this case the values of nodes of the other type are
 |  arbitrary!
 |  
 |  Attributes
 |  ----------
 |  node_count : int
 |      The number of nodes (internal nodes + leaves) in the tree.
 |  
 |  capacity : int
 |      The current capacity (i.e., size) of the arrays, which is at least as
 |      great as `node_count`.
 |  
 |  max_depth : int
 |      The depth of the tree, i.e. the maximum depth of its leaves.
 |  
 |  children_left : array of int, shape [node_count]
 |      children_left[i] holds the node id of the left child of node i.
 |      For leaves, children_left[i] == TREE_LEAF. Otherwise,
 |      children_left[i] > i. This child handles the case where
 |      X[:, feature[i]] <= threshold[i].
 |  
 |  children_right : array of int, shape [node_count]
 |      children_right[i] holds the node id of the right child of node i.
 |      For leaves, children_right[i] == TREE_LEAF. Otherwise,
 |      children_right[i] > i. This child handles the case where
 |      X[:, feature[i]] > threshold[i].
 |  
 |  feature : array of int, shape [node_count]
 |      feature[i] holds the feature to split on, for the internal node i.
 |  
 |  threshold : array of double, shape [node_count]
 |      threshold[i] holds the threshold for the internal node i.
 |  
 |  value : array of double, shape [node_count, n_outputs, max_n_classes]
 |      Contains the constant prediction value of each node.
 |  
 |  impurity : array of double, shape [node_count]
 |      impurity[i] holds the impurity (i.e., the value of the splitting
 |      criterion) at node i.
 |  
 |  n_node_samples : array of int, shape [node_count]
 |      n_node_samples[i] holds the number of training samples reaching node i.
 |  
 |  weighted_n_node_samples : array of int, shape [node_count]
 |      weighted_n_node_samples[i] holds the weighted number of training samples
 |      reaching node i.
 |  
 |  Methods defined here:
 |  
 |  __getstate__(...)
 |      Getstate re-implementation, for pickling.
 |  
 |  __reduce__(...)
 |      Reduce re-implementation, for pickling.
 |  
 |  __setstate__(...)
 |      Setstate re-implementation, for unpickling.
 |  
 |  apply(...)
 |      Finds the terminal region (=leaf node) for each sample in X.
 |  
 |  compute_feature_importances(...)
 |      Computes the importance of each feature (aka variable).
 |  
 |  compute_partial_dependence(...)
 |      Partial dependence of the response on the ``target_feature`` set.
 |      
 |      For each sample in ``X`` a tree traversal is performed.
 |      Each traversal starts from the root with weight 1.0.
 |      
 |      At each non-leaf node that splits on a target feature, either
 |      the left child or the right child is visited based on the feature
 |      value of the current sample, and the weight is not modified.
 |      At each non-leaf node that splits on a complementary feature,
 |      both children are visited and the weight is multiplied by the fraction
 |      of training samples which went to each child.
 |      
 |      At each leaf, the value of the node is multiplied by the current
 |      weight (weights sum to 1 for all visited terminal nodes).
 |      
 |      Parameters
 |      ----------
 |      X : view on 2d ndarray, shape (n_samples, n_target_features)
 |          The grid points on which the partial dependence should be
 |          evaluated.
 |      target_features : view on 1d ndarray, shape (n_target_features)
 |          The set of target features for which the partial dependence
 |          should be evaluated.
 |      out : view on 1d ndarray, shape (n_samples)
 |          The value of the partial dependence function on each grid
 |          point.
 |  
 |  decision_path(...)
 |      Finds the decision path (=node) for each sample in X.
 |  
 |  predict(...)
 |      Predict target for X.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  capacity
 |  
 |  children_left
 |  
 |  children_right
 |  
 |  feature
 |  
 |  impurity
 |  
 |  max_depth
 |  
 |  max_n_classes
 |  
 |  n_classes
 |  
 |  n_features
 |  
 |  n_leaves
 |  
 |  n_node_samples
 |  
 |  n_outputs
 |  
 |  node_count
 |  
 |  threshold
 |  
 |  value
 |  
 |  weighted_n_node_samples
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __pyx_vtable__ = <capsule object NULL>


import numpy as np
import pandas as pd
import sklearn
import matplotlib as mlp
import matplotlib.pyplot as plt
import time #计时模块time
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import cross_validate, KFold, GridSearchCV


def RMSE(cvresult,key):
    """Return the mean RMSE over the folds stored under ``cvresult[key]``.

    ``cvresult[key]`` is expected to hold (negated) MSE scores as a numeric
    array: the absolute value recovers the MSE, the square root turns each
    entry into an RMSE, and the entries are then averaged.
    """
    mse_per_fold = abs(cvresult[key])
    rmse_per_fold = mse_per_fold ** 0.5
    return rmse_per_fold.mean()


# Load the encoded house-price training set, using the first CSV column as the
# row index. The relative path is written with forward slashes so it resolves
# on Windows as well as on POSIX systems (a backslash is a path separator on
# Windows only).
data = pd.read_csv("./datasets/House Price/train_encode.csv",index_col=0)


X = data.iloc[:,:-1]
y = data.iloc[:,-1]


X.shape

(1460, 80)


X.head()


# Baseline before tuning: default random forest with a fixed seed,
# evaluated with shuffled 5-fold cross-validation
reg = RFR(random_state=1412)
cv = KFold(n_splits=5,shuffle=True,random_state=1412)


# Negated-MSE scores on both the training and the test folds
result_pre_adjusted = cross_validate(reg,X,y,cv=cv,scoring="neg_mean_squared_error"
                          ,return_train_score=True
                          ,verbose=True
                          ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.1s finished


RMSE(result_pre_adjusted,"train_score") # 训练集的平均RMSE

11177.272008319653


RMSE(result_pre_adjusted,"test_score") # 测试集的平均RMSE

30571.26665524217


# Search space for the grid search: 2 * 16 * 8 * 6 * 1 = 1536 combinations
param_grid_simple = {"criterion": ["squared_error","poisson"]
                     , 'n_estimators': [*range(20,100,5)]              # 20, 25, ..., 95 (the stop value 100 is excluded)
                     , 'max_depth': [*range(10,25,2)]                  # 10, 12, ..., 24
                     , "max_features": ["log2","sqrt",16,32,64,"auto"] # 16, 32, 64: fixed sizes in addition to the named options
                     , "min_impurity_decrease": [*np.arange(0,5,10)]   # NOTE(review): arange(0,5,10) yields only [0] because the step exceeds the range -- confirm the intended grid
                    }


# Depending on the machine, consider n_jobs=4 or 8 so the search does not take
# up all resources and make the computer unresponsive; verbose=True prints the
# tree-building progress.
# NOTE(review): both the estimator and the search use n_jobs=-1 -- confirm that
# this nested parallelism is intended.
reg = RFR(random_state=1412,verbose=True,n_jobs=-1)
cv = KFold(n_splits=5,shuffle=True,random_state=1412)
search = GridSearchCV(estimator=reg
                     ,param_grid=param_grid_simple
                     ,scoring = "neg_mean_squared_error"
                     ,verbose = True
                     ,cv = cv
                     ,n_jobs=-1)


# ===== TIME WARNING: roughly 7 minutes (about 382 s in the recorded run) =====
# Measure the elapsed time with perf_counter(), a monotonic clock meant for
# timing intervals, so system clock adjustments cannot distort the result.
start = time.perf_counter()
search.fit(X,y)
print(time.perf_counter() - start)

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits
381.6039867401123

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  85 out of  85 | elapsed:    0.0s finished


search.best_estimator_

RandomForestRegressor(max_depth=23, max_features=16, min_impurity_decrease=0,
                      n_estimators=85, n_jobs=-1, random_state=1412,
                      verbose=True)


# 以上参数在测试集中的分数
abs(search.best_score_)**0.5

29179.698261599166


# Rebuild the forest with the hyperparameters reported for the best estimator
ad_reg = RFR(n_estimators=85, max_depth=23, max_features=16, random_state=1412)


# Same splitter and metric as for the untuned forest, so results are comparable
cv = KFold(n_splits=5,shuffle=True,random_state=1412)
result_post_adjusted = cross_validate(ad_reg,X,y,cv=cv,scoring="neg_mean_squared_error"
                          ,return_train_score=True
                          ,verbose=True
                          ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


# 训练集上的结果
RMSE(result_post_adjusted,"train_score")

11000.81099038192


# 测试集上的结果
RMSE(result_post_adjusted,"test_score")

28572.070208366855


# Per-fold RMSE of the random forest before (green) and after (orange)
# hyperparameter tuning; solid lines are training folds, dashed lines test folds
xaxis = range(1,6)
plt.figure(figsize=(8,6),dpi=80)
comparisons = (
    (result_pre_adjusted, "green", "RF_pre_ad_Train", "RF_pre_ad_Test"),
    (result_post_adjusted, "orange", "RF_post_ad_Train", "RF_post_ad_Test"),
)
for cv_result, colour, train_label, test_label in comparisons:
    plt.plot(xaxis, abs(cv_result["train_score"])**0.5, color=colour, label=train_label)
    plt.plot(xaxis, abs(cv_result["test_score"])**0.5, color=colour, linestyle="--", label=test_label)
plt.xticks(list(xaxis))
plt.xlabel("CVcounts", fontsize=16)
plt.ylabel("RMSE", fontsize=16)
plt.legend()
plt.show()


X.shape

(1460, 80)


y.shape

(1460,)


from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error


# Load the California housing dataset once and unpack it directly into the
# feature matrix (X_fc) and the label vector (y_fc), instead of calling the
# loader twice and reading the dataset two times.
X_fc, y_fc = fetch_california_housing(return_X_y=True)


X_fc

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])


X_fc.shape # as shown, the California housing dataset has 8 features

(20640, 8)


model = RFR(n_estimators=3, warm_start=False) # warm_start=False: incremental learning is not supported


# First fit, on the California housing data
model1 = model.fit(X_fc,y_fc)


# RMSE on the same data the model was just fitted on
(mean_squared_error(y_fc,model1.predict(X_fc)))**0.5

0.2995901715086445


# Use .estimators_ to inspect every tree in the forest; the random seed of each tree is visible
model1.estimators_

[DecisionTreeRegressor(max_features='auto', random_state=202294224),
 DecisionTreeRegressor(max_features='auto', random_state=1414478702),
 DecisionTreeRegressor(max_features='auto', random_state=2081316624)]


model1 = model1.fit(X.iloc[:,:8],y)
# Note: X has 80 features while X_fc has only 8; data fed to the same model must have the same structure


model1.estimators_ # Did you notice? The original trees in model1 are gone; new trees have replaced them

[DecisionTreeRegressor(max_features='auto', random_state=1958478857),
 DecisionTreeRegressor(max_features='auto', random_state=1676985416),
 DecisionTreeRegressor(max_features='auto', random_state=212040939)]


# RMSE on the California housing data, evaluated with model1 after it was refitted on X.iloc[:,:8], y
(mean_squared_error(y_fc,model1.predict(X_fc)))**0.5

C:\Users\zhiyuan\anaconda3\envs\kaggle\lib\site-packages\sklearn\base.py:451: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names
  "X does not have valid feature names, but"

220052.01202180694


model = RFR(n_estimators=3, warm_start=True) # warm_start=True: incremental learning is supported


# First fit, on the California housing data
model2 = model.fit(X_fc,y_fc)


# RMSE on the same data the model was just fitted on
(mean_squared_error(y_fc,model2.predict(X_fc)))**0.5

0.30011650693150316


model2.estimators_

[DecisionTreeRegressor(max_features='auto', random_state=1878468972),
 DecisionTreeRegressor(max_features='auto', random_state=92910986),
 DecisionTreeRegressor(max_features='auto', random_state=1405695826)]


model2 = model2.fit(X.iloc[:,:8],y) # take the first 8 features

C:\Users\zhiyuan\anaconda3\envs\kaggle\lib\site-packages\sklearn\ensemble\_forest.py:430: UserWarning: Warm-start fitting without increasing n_estimators does not fit new trees.
  "Warm-start fitting without increasing n_estimators does not "


(mean_squared_error(y_fc,model2.predict(X_fc)))**0.5

C:\Users\zhiyuan\anaconda3\envs\kaggle\lib\site-packages\sklearn\base.py:451: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names
  "X does not have valid feature names, but"

0.30011650693150316


model2.estimators_ # with incremental learning, the trees did not change

[DecisionTreeRegressor(max_features='auto', random_state=1878468972),
 DecisionTreeRegressor(max_features='auto', random_state=92910986),
 DecisionTreeRegressor(max_features='auto', random_state=1405695826)]


model2.estimators_ # an attribute: it reflects characteristics and objective properties of the fitted model

[DecisionTreeRegressor(max_features='auto', random_state=1878468972),
 DecisionTreeRegressor(max_features='auto', random_state=92910986),
 DecisionTreeRegressor(max_features='auto', random_state=1405695826)]


# Access the model's parameter directly; parameters can be changed this way without re-instantiating the model
model2.n_estimators += 2 # add 2 trees, to be used for incremental learning


model2

RandomForestRegressor(n_estimators=5, warm_start=True)


model2.fit(X.iloc[:,:8],y) # fit

RandomForestRegressor(n_estimators=5, warm_start=True)


model2.estimators_ # the original trees are still unchanged; the added trees were trained on the newly supplied data

[DecisionTreeRegressor(max_features='auto', random_state=1878468972),
 DecisionTreeRegressor(max_features='auto', random_state=92910986),
 DecisionTreeRegressor(max_features='auto', random_state=1405695826),
 DecisionTreeRegressor(max_features='auto', random_state=1240115137),
 DecisionTreeRegressor(max_features='auto', random_state=1531900695)]


trainpath = r".\datasets\Big data\bigdata_train.csv"
testpath = r".\datasets\Big data\bigdata_test.csv"


# Use deque and StringIO as helpers to load the last n lines of a csv file
from collections import deque # deque: double-ended queue
from io import StringIO


# Keep only the last 5 lines of the training file: a deque with maxlen 5
# discards older lines as the file is iterated.
# The handle is deliberately NOT called `data`, so that it does not overwrite the
# notebook-level DataFrame `data` with a closed file object.
with open(trainpath, 'r') as csv_file:
    q = deque(csv_file, 5)


q # the last 5 lines of the data

deque(['995029,3.0,3.0,5.0,5.0,2.0,3.0,2.0,5.0,5.0,5.0,2.0,2.0,4.0,4.0,1.0,1.0,2.0,4.0,4.0,2.0,4.0,3.0,1.0,4.0,1.0,4.0,2.0,4.0,4.0,5.0,4.0,3.0,4.0,3.0,3.0,4.0,4.0,1.0,2.0,5.0,3.0,3.0,3.0,1.0,3.0,4.0,5.0,2.0,5.0,3.0,82719.0,5474.0,7131.0,27265.0,12898.0,18537.0,13712.0,9704.0,9312.0,10824.0,17332.0,25771.0,21437.0,39362.0,29041.0,16015.0,12711.0,12114.0,11141.0,10610.0,34767.0,23585.0,2453.0,23004.0,4677.0,31609.0,11498.0,24396.0,8758.0,11288.0,18892.0,31976.0,5874.0,23840.0,38838.0,13131.0,8298.0,15644.0,7292.0,8649.0,8513.0,18259.0,34832.0,3168.0,3306.0,17459.0,12079.0,9565.0,6310.0,24019.0,291658.0,666.0,469.0,37.0,1954.0,33.0,0.0,41.0,865.0,-70.6503\n',
       '995030,2.0,4.0,4.0,2.0,4.0,2.0,4.0,4.0,4.0,4.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,0.0,2.0,4.0,2.0,4.0,1.0,4.0,4.0,4.0,5.0,2.0,5.0,1.0,4.0,1.0,5.0,1.0,5.0,4.0,4.0,1.0,5.0,2.0,5.0,1.0,4.0,0.0,4.0,4.0,5470.0,2913.0,2137.0,2967.0,2060.0,2499.0,4645.0,7550.0,4682.0,5440.0,6976.0,5326.0,1556.0,4765.0,4029.0,2760.0,4450.0,5211.0,1623.0,4274.0,2652.0,14.0,5820.0,4383.0,9358.0,1826.0,4699.0,2441.0,5734.0,5116.0,1797.0,4038.0,2386.0,6465.0,3693.0,3399.0,5281.0,3727.0,1413.0,9482.0,2911.0,3693.0,2766.0,3782.0,1958.0,4180.0,7876.0,14.0,5032.0,2003.0,968800.0,666.0,469.0,6.0,208.0,30.0,0.0,208.0,19838.0,-123.0867\n',
       '995031,2.0,1.0,3.0,2.0,5.0,1.0,5.0,4.0,4.0,3.0,2.0,5.0,3.0,4.0,1.0,1.0,1.0,3.0,2.0,2.0,3.0,4.0,1.0,4.0,3.0,5.0,1.0,3.0,5.0,5.0,4.0,4.0,5.0,2.0,4.0,3.0,4.0,1.0,3.0,4.0,3.0,2.0,4.0,2.0,3.0,2.0,3.0,1.0,4.0,4.0,74490.0,6721.0,19705.0,8894.0,11176.0,27505.0,12282.0,12209.0,7985.0,7434.0,34860.0,9848.0,9200.0,13043.0,995.0,5621.0,3592.0,4657.0,7069.0,5196.0,40330.0,6027.0,6590.0,5617.0,6727.0,14579.0,7065.0,10650.0,1350.0,8252.0,33374.0,7564.0,4720.0,14695.0,15016.0,9359.0,9410.0,53991.0,14756.0,5976.0,31580.0,6783.0,24779.0,41707.0,8803.0,8412.0,4472.0,7805.0,5635.0,5801.0,567037.0,93.0,541.0,596.0,2892.0,1602.0,0.0,144.0,2745.0,112.5\n',
       '995032,1.0,4.0,1.0,5.0,2.0,2.0,1.0,5.0,2.0,4.0,5.0,1.0,4.0,2.0,4.0,4.0,3.0,4.0,5.0,3.0,2.0,2.0,2.0,4.0,2.0,4.0,2.0,4.0,3.0,3.0,4.0,4.0,5.0,2.0,2.0,3.0,3.0,2.0,2.0,3.0,4.0,1.0,3.0,1.0,4.0,1.0,3.0,4.0,4.0,3.0,14075.0,6301.0,3611.0,4143.0,5949.0,4900.0,10292.0,3254.0,3896.0,4297.0,5380.0,8667.0,6565.0,3442.0,4335.0,10107.0,15301.0,6697.0,4305.0,3574.0,44820.0,15434.0,7953.0,6857.0,9700.0,5049.0,6720.0,4937.0,12857.0,6981.0,10642.0,8187.0,3713.0,15412.0,4186.0,7606.0,3064.0,4122.0,3769.0,6718.0,5027.0,6272.0,7023.0,8974.0,8198.0,4774.0,11819.0,8736.0,3880.0,4768.0,989963.0,57.0,441.0,13.0,520.0,29.0,0.0,208.0,10546.0,-97.0\n',
       '995033,3.0,2.0,4.0,3.0,4.0,2.0,4.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,5.0,3.0,3.0,2.0,5.0,2.0,5.0,2.0,5.0,2.0,4.0,5.0,4.0,3.0,5.0,3.0,3.0,2.0,4.0,3.0,3.0,1.0,4.0,3.0,3.0,5.0,3.0,3.0,2.0,3.0,2.0,4.0,4.0,7811.0,12321.0,728.0,2997.0,5020.0,515.0,10110.0,11314.0,4200.0,3473.0,2808.0,10826.0,7022.0,1590.0,917.0,2999.0,3919.0,661.0,2962.0,7594.0,4430.0,2462.0,7012.0,3126.0,4808.0,3359.0,623.0,5745.0,580.0,4144.0,2358.0,4829.0,735.0,3742.0,6546.0,5603.0,5158.0,6435.0,7207.0,6272.0,2231.0,5253.0,2480.0,18923.0,18792.0,14734.0,7294.0,7964.0,4358.0,4733.0,443675.0,36.0,272.0,3.0,285.0,15.0,0.0,208.0,9322.0,-76.3729\n'])


# Turn the collected lines into a pandas DataFrame (no header row among them)
pd.read_csv(StringIO(''.join(q)), header=None)


# If the data has no index, try pandas' skiprows and nrows parameters instead
# skiprows: skip the first `skiprows` lines on this read
# nrows: read only `nrows` rows on this read
# For example, with skiprows=1000, nrows=1000, pandas reads lines 1001~2000
# NOTE(review): with the default header="infer" the first line after the skipped ones is taken as the header — confirm the exact row range
# When skiprows exceeds the amount of data, an EmptyDataError is raised

# Probe increasing offsets; the loop is expected to end with EmptyDataError once i is past the end of the file
for i in range(0,10**7,100000):
    df = pd.read_csv(trainpath, skiprows=i, nrows=1)
    print(i)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000

---------------------------------------------------------------------------
EmptyDataError                            Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_11668\4120898153.py in <module>
      6 
      7 for i in range(0,10**7,100000):
----> 8     df = pd.read_csv(trainpath,skiprows=i, nrows=1)
      9     print(i)

~\anaconda3\envs\kaggle\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    309                     stacklevel=stacklevel,
    310                 )
--> 311             return func(*args, **kwargs)
    312 
    313         return wrapper

~\anaconda3\envs\kaggle\lib\site-packages\pandas\io\parsers\readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    584     kwds.update(kwds_defaults)
    585 
--> 586     return _read(filepath_or_buffer, kwds)
    587 
    588 

~\anaconda3\envs\kaggle\lib\site-packages\pandas\io\parsers\readers.py in _read(filepath_or_buffer, kwds)
    480 
    481     # Create the parser.
--> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
    483 
    484     if chunksize or iterator:

~\anaconda3\envs\kaggle\lib\site-packages\pandas\io\parsers\readers.py in __init__(self, f, engine, **kwds)
    809             self.options["has_index_names"] = kwds["has_index_names"]
    810 
--> 811         self._engine = self._make_engine(self.engine)
    812 
    813     def close(self):

~\anaconda3\envs\kaggle\lib\site-packages\pandas\io\parsers\readers.py in _make_engine(self, engine)
   1038             )
   1039         # error: Too many arguments for "ParserBase"
-> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1041 
   1042     def _failover_to_python(self):

~\anaconda3\envs\kaggle\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py in __init__(self, src, **kwds)
     67         kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
     68         try:
---> 69             self._reader = parsers.TextReader(self.handles.handle, **kwds)
     70         except Exception:
     71             self.handles.close()

~\anaconda3\envs\kaggle\lib\site-packages\pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

EmptyDataError: No columns to parse from file


[*range(0,10**6,50000)]

[0,
 50000,
 100000,
 150000,
 200000,
 250000,
 300000,
 350000,
 400000,
 450000,
 500000,
 550000,
 600000,
 650000,
 700000,
 750000,
 800000,
 850000,
 900000,
 950000]


looprange = range(0,10**6,50000) # loop range: offsets 0, 50000, ..., 950000


# Warm-start random forest regressor, starting with 10 trees
reg = RFR(n_estimators=10
          ,random_state=1412
          ,warm_start=True
          ,verbose=True # incremental learning always takes a long time, so you may choose to display the progress
          ,n_jobs=-1    # use all of your resources for training
         )


# Define the test set: every column but the last goes to Xtest, the last column to Ytest
test = pd.read_csv(testpath,header="infer",index_col=0)
Xtest = test.iloc[:,:-1]
Ytest = test.iloc[:,-1]


Xtest.head()


# What happens when skiprows+nrows goes past the end of the data?


trainsubset = pd.read_csv(trainpath, header=None, index_col=0
                          , skiprows=950000
                          , nrows=50000)


# Answer: all of the remaining rows are returned, even though fewer than the requested 50,000 are left
trainsubset.tail(5) # subset of the training set


trainsubset.shape

(45035, 110)


# Incremental training: walk through the training CSV in slices of 50,000 rows
# and fit the warm-start forest `reg` on one slice at a time.
for line in looprange:
    # The first slice is fitted with the trees reg already has;
    # every later slice adds 10 new trees.
    newtree = 0 if line == 0 else 10

    # Keep the header row on every read and skip only the `line` data rows that
    # were already consumed. An integer skiprows would also skip the header line,
    # which made the second slice re-read the last row of the first slice and made
    # every later slice lose its column names (fitting without feature names).
    trainsubset = pd.read_csv(trainpath, header="infer", index_col=0,
                              skiprows=range(1, line + 1), nrows=50000)

    # Nothing left to read, e.g. the row count is an exact multiple of the slice size
    if trainsubset.empty:
        break

    Xtrain = trainsubset.iloc[:,:-1]
    Ytrain = trainsubset.iloc[:,-1]
    reg.n_estimators += newtree
    reg = reg.fit(Xtrain,Ytrain)
    print("DONE",line+50000)

    # A slice shorter than 50,000 rows means the end of the file was reached: stop
    if Xtrain.shape[0] < 50000:
        break

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    6.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished

DONE 50000

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    6.5s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.8s finished

DONE 100000

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    7.1s remaining:    4.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.8s finished

DONE 150000

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    9.1s remaining:    6.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   12.5s finished

DONE 200000

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    6.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.4s finished

DONE 250000


reg.score(Xtest,Ytest) # R2 of about 99%; this may be related to the test set containing too little data

C:\Users\zhiyuan\anaconda3\envs\kaggle\lib\site-packages\sklearn\base.py:444: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
  f"X has feature names, but {self.__class__.__name__} was fitted without"
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.2s finished

0.9903773275013429


r = np.array([-1,-1,-1, 1, 1, 1, 1]) # the outputs take the values -1 and 1


(r == 1).sum()

4


(r == -1).sum()

3


r = np.array([-1,-1,-1, 1, 1, 1, 1])


r.mean()

0.14285714285714285


def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of ``z``.

    Parameters
    ----------
    z : scalar or array-like of numbers
        Input value(s).

    Returns
    -------
    NumPy float scalar for scalar input, otherwise an ndarray with the same
    shape as ``z``; every value lies in [0, 1].

    The computation only ever exponentiates ``-|z|``, so it cannot overflow:
    large negative inputs give 0.0 and large positive inputs give 1.0 instead
    of raising ``OverflowError`` (Python scalars) or emitting overflow
    warnings (NumPy inputs).
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) is always within [0, 1]
    decay = np.exp(-np.abs(z))
    # For z >= 0: 1 / (1 + e^-z); for z < 0 the equivalent form e^z / (1 + e^z)
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array into a scalar and leaves n-d arrays unchanged
    return result[()]


sigmoid(r.mean())

0.5356536708339716


# 有三棵树
1,2,3

# 任意两棵树判断错误
A (1,2),3    -  error * error * (1-error)
B (1),2,(3)  -  error * (1 - error) * error
C 1,(2,3)    - (1 - error) * error * error


3(error^2 * (1-error))


P(A或者B或者C) = P(A) + P(B) + P(C)   （A、B、C三种情况互斥，因此概率可以直接相加）


import numpy as np
from scipy.special import comb # computes the number of combinations

# Sum over i = 13..25 of C(25,i) * 0.2**i * 0.8**(25-i):
# the binomial probability of at least 13 occurrences in 25 trials that each have probability 0.2
np.array([comb(25,i)*(0.2**i)*((1-0.2)**(25-i)) for i in range(13,26)]).sum()

0.00036904803455582827


import numpy as np

# 20 evenly spaced values in [0, 1], used as the x-axis
x = np.linspace(0,1,20)

# y collects, for each epsilon, the binomial tail probability computed below
y = []
for epsilon in np.linspace(0,1,20):
    # Sum over i = 13..25 of C(25,i) * epsilon**i * (1-epsilon)**(25-i)
    E = np.array([comb(25,i)*(epsilon**i)*((1-epsilon)**(25-i)) 
                  for i in range(13,26)]).sum()
    y.append(E)
plt.plot(x,y,"o-")
plt.plot(x,x,"--",color="red") # dashed red reference line y = x
plt.xlabel("individual estimator's error")
plt.ylabel("RandomForest's error")
plt.grid()
plt.show()

Bagging算法	集成类
随机森林分类	RandomForestClassifier
随机森林回归	RandomForestRegressor
极端随机树分类	ExtraTreesClassifier
极端随机树回归	ExtraTreesRegressor
装袋法分类	BaggingClassifier
装袋法回归	BaggingRegressor

类型	参数
弱分类器数量、森林中有多少棵树	n_estimators
弱分类器的训练数据	bootstrap, oob_score, max_samples, max_features, random_state
弱分类器结构、树深度等	criterion, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes, min_impurity_decrease
其他	n_jobs, verbose, ccp_alpha

影响力	参数
⭐⭐⭐⭐⭐ 几乎总是具有巨大影响力	n_estimators（整体学习能力） max_depth（粗剪枝） max_features（随机性）
⭐⭐⭐⭐ 大部分时候具有影响力	max_samples（随机性） class_weight（分类树中的样本均衡）
⭐⭐ 可能有大影响力，但大部分时候影响力不明显	min_samples_split（精剪枝） min_impurity_decrease（精剪枝） max_leaf_nodes（精剪枝） criterion（分枝敏感度）
⭐ 当数据量足够大时，几乎无影响	random_state ccp_alpha（结构风险）

参数	参数含义	对应属性	属性含义
n_estimators	树的数量	reg.estimators_	森林中所有树对象
max_depth	允许的最大深度	.tree_.max_depth	0号树实际的深度
max_leaf_nodes	允许的最大叶子节点量	.tree_.node_count	0号树实际的总节点量
min_samples_split	分枝所需最小样本量	.tree_.n_node_samples	0号树每片叶子上实际的样本量
min_weight_fraction_leaf	分枝所需最小样本权重	.tree_.weighted_n_node_samples	0号树每片叶子上实际的样本权重
min_impurity_decrease	分枝所需最小不纯度下降量	.tree_.impurity .tree_.threshold	0号树每片叶子上的实际不纯度 0号树每个节点分枝后不纯度下降量

影响力	参数
⭐⭐⭐⭐⭐ 几乎总是具有巨大影响力	n_estimators（整体学习能力） max_depth（粗剪枝） max_features（随机性）
⭐⭐⭐⭐ 大部分时候具有影响力	max_samples（随机性） class_weight（样本均衡）
⭐⭐ 可能有大影响力，但大部分时候影响力不明显	min_samples_split（精剪枝） min_impurity_decrease（精剪枝） max_leaf_nodes（精剪枝） criterion（分枝敏感度）
⭐ 当数据量足够大时，几乎无影响	random_state ccp_alpha（结构风险）

集成学习¶

一集成学习的三大关键领域¶

二 Bagging方法的基本思想¶

三随机森林RandomForest¶

1 RandomForestRegressor的实现¶

2 随机森林回归器的参数¶

2.1 弱分类器结构¶

2.2 弱分类器数量¶

2.3 弱分类器训练的数据¶

2.4 其他参数¶

四集成算法的参数空间与网格优化（集成算法的统一调参）¶

五随机森林在巨量数据上的增量学习¶

1 普通学习 vs 增量学习¶

2 实际应用例子：增量学习在Kaggle数据上的应用¶

六 Bagging方法6大面试热点问题¶

	Id	住宅类型	住宅区域	街道接触面积(英尺)	住宅面积	街道路面状况	住宅形状(大概)	住宅现状	...	销售月份	销售年份	销售类型	销售状态	SalePrice
0	0.0	5.0	3.0	36.0	327.0	1.0	3.0	3.0	...	1.0	2.0	8.0	4.0	208500
1	1.0	0.0	3.0	51.0	498.0	1.0	3.0	3.0	...	4.0	1.0	8.0	4.0	181500
2	2.0	5.0	3.0	39.0	702.0	1.0	0.0	3.0	...	8.0	2.0	8.0	4.0	223500
3	3.0	6.0	3.0	31.0	489.0	1.0	0.0	3.0	...	1.0	0.0	8.0	0.0	140000
4	4.0	5.0	3.0	55.0	925.0	1.0	0.0	3.0	...	11.0	2.0	8.0	4.0	250000

cross_validate	参数
n_jobs	允许该程序调用的线程数
verbose	是否打印进度

	0	1	2	3	4	5	6	7	8	9	...	101	102	103	104	105	106	108	109	110
0	995029	3.0	3.0	5.0	5.0	2.0	3.0	2.0	5.0	5.0	...	291658.0	666.0	469.0	37.0	1954.0	33.0	41.0	865.0	-70.6503
1	995030	2.0	4.0	4.0	2.0	4.0	2.0	4.0	4.0	4.0	...	968800.0	666.0	469.0	6.0	208.0	30.0	208.0	19838.0	-123.0867
2	995031	2.0	1.0	3.0	2.0	5.0	1.0	5.0	4.0	4.0	...	567037.0	93.0	541.0	596.0	2892.0	1602.0	144.0	2745.0	112.5000
3	995032	1.0	4.0	1.0	5.0	2.0	2.0	1.0	5.0	2.0	...	989963.0	57.0	441.0	13.0	520.0	29.0	208.0	10546.0	-97.0000
4	995033	3.0	2.0	4.0	3.0	4.0	2.0	4.0	3.0	4.0	...	443675.0	36.0	272.0	3.0	285.0	15.0	208.0	9322.0	-76.3729

	EXT1	EXT2	EXT3	EXT4	EXT5	EXT6	EXT7	EXT8	EXT9	EXT10	...	OPN10_E	dateload	screenw	screenh	introelapse	testelapse	endelapse	IPC	country	lat_appx_lots_of_err
0	3	2	4	2	4	2	2	2	4	2	...	1398	971898	396	469	60	188	12	0	208	8663
1	4	1	4	2	5	1	5	2	5	2	...	995	814072	1024	755	168	105	6	0	208	20723
2	3	5	3	3	3	5	3	5	1	5	...	3533	86908	396	469	26	316	20	0	61	30695
3	3	2	5	5	3	2	4	3	2	2	...	1533	822586	666	469	3	323	17	0	51	27919
4	2	1	5	1	5	1	5	3	2	5	...	2491	728955	666	469	26	210	225	39	12	434

集成学习¶

一 集成学习的三大关键领域¶

二 Bagging方法的基本思想¶

三 随机森林RandomForest¶

1 RandomForestRegressor的实现¶

2 随机森林回归器的参数¶

2.1 弱分类器结构¶

2.2 弱分类器数量¶

2.3 弱分类器训练的数据¶

2.4 其他参数¶

四 集成算法的参数空间与网格优化（集成算法的统一调参）¶

五 随机森林在巨量数据上的增量学习¶

1 普通学习 vs 增量学习¶

2 实际应用例子：增量学习在Kaggle数据上的应用¶

六 Bagging方法6大面试热点问题¶

一集成学习的三大关键领域¶

三随机森林RandomForest¶

四集成算法的参数空间与网格优化（集成算法的统一调参）¶

五随机森林在巨量数据上的增量学习¶