import numpy as np
import pandas as pd
import sklearn
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns
import time
#import re, pip, conda


"""
for package in [sklearn,mlp,np,pd,sns,pip,conda]:
    print(re.findall("([^']*)",str(package))[2],package.__version__)
"""

'\nfor package in [sklearn,mlp,np,pd,sns,pip,conda]:\n    print(re.findall("([^\']*)",str(package))[2],package.__version__)\n'


#pip install --upgrade scikit-learn
#conda update scikit-learn


from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import cross_validate, KFold, GridSearchCV


# Load the encoded House Price training set; the first CSV column becomes the index.
data = pd.read_csv(r"..\Lesson 09.随机森林模型\datasets\House Price\train_encode.csv",index_col=0)


X = data.iloc[:,:-1] # features: every column except the last
y = data.iloc[:,-1]  # label: the last column


X.shape

(1460, 80)


X.head()


y.head() # 标签

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64


y.describe() #RMSE

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


# Parameter space for the exhaustive grid search.
param_grid_simple = {
    "criterion": ["squared_error", "poisson"],
    "n_estimators": [*range(20, 100, 5)],
    "max_depth": [*range(10, 25, 2)],
    # 1.0 (consider every feature) replaces the legacy "auto" alias. For a
    # regressor both mean the same thing, but "auto" was deprecated in
    # scikit-learn 1.1 and is rejected from 1.3 onwards.
    "max_features": ["log2", "sqrt", 16, 32, 64, 1.0],
    # NOTE: np.arange(0, 5, 10) yields only the value 0 because the step is
    # larger than the interval; kept unchanged so the space size stays 1536.
    "min_impurity_decrease": [*np.arange(0, 5, 10)],
}


#参数空间大小计算

2 * len([*range(20,100,5)]) * len([*range(10,25,2)]) * len(["log2","sqrt",16,32,64,"auto"]) * len([*np.arange(0,5,10)])

1536


# Compute the size of the space with a plain loop: multiply together the
# number of candidate values of every hyper-parameter.
no_option = 1
for candidates in param_grid_simple.values():
    no_option = no_option * len(candidates)


no_option # 最终备选项数目，组合数量

1536


# Model, cross-validation splitter and exhaustive grid search.
reg = RFR(n_jobs=-1, verbose=True, random_state=1412)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)
search = GridSearchCV(
    estimator=reg,
    param_grid=param_grid_simple,
    scoring="neg_mean_squared_error",
    cv=cv,
    verbose=True,
    n_jobs=-1,
)


# ===== TIME WARNING: about 7 minutes =====
# Wall-clock the whole search and print the elapsed time in seconds.
start = time.time()
search.fit(X, y)
print(time.time() - start)

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits
381.6039867401123

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  85 out of  85 | elapsed:    0.0s finished


381.6039/60 # 运行分钟

6.3600650000000005


search.best_estimator_

RandomForestRegressor(max_depth=23, max_features=16, min_impurity_decrease=0,
                      n_estimators=85, n_jobs=-1, random_state=1412,
                      verbose=True)


abs(search.best_score_)**0.5 # 最优RMSE

29179.698261599166


# Rebuild the model with the best parameters found and check how it performs.
ad_reg = RFR(
    n_estimators=85,
    max_depth=23,
    max_features=16,
    random_state=1412,
)


# Same 5-fold shuffled split as the search, so the numbers are comparable.
cv = KFold(n_splits=5, shuffle=True, random_state=1412)
result_post_adjusted = cross_validate(
    ad_reg,
    X,
    y,
    cv=cv,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    verbose=True,
    n_jobs=-1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


def RMSE(cvresult,key):
    """Return the mean of the per-fold RMSE values stored under ``key``.

    ``cvresult[key]`` holds one (negative) MSE score per fold. Each score is
    made positive and square-rooted individually, then the roots are averaged.
    This is the average of per-fold RMSEs, which is not the same number as the
    square root of the averaged MSE.
    """
    fold_mse = abs(cvresult[key])
    fold_rmse = fold_mse ** 0.5
    return fold_rmse.mean()


RMSE(result_post_adjusted,"train_score")

11000.81099038192


RMSE(result_post_adjusted,"test_score")

28572.070208366855


# Helpers packaged as functions for reuse in the rest of the notebook.
# Evaluation metric: RMSE
def RMSE(cvresult,key):
    """Average the per-fold RMSE derived from the scores in ``cvresult[key]``.

    The scores are negative MSE values (one per fold); the sign is dropped,
    each value is square-rooted, and the mean of those roots is returned.
    Note this averages fold-level RMSEs rather than taking the root of the
    mean MSE, so it can differ from ``abs(best_score_) ** 0.5``.
    """
    per_fold_rmse = abs(cvresult[key]) ** 0.5
    return per_fold_rmse.mean()

# Compute the size of a parameter space
def count_space(param):
    """Print the number of combinations in the parameter grid ``param``.

    ``param`` maps each hyper-parameter name to a sized collection of candidate
    values; the result is the product of the collection lengths.

    The previous version ignored ``param`` and always read the module-level
    ``param_grid_simple``, so it only worked by accident when that global was
    the grid being passed in.
    """
    no_option = 1
    for candidates in param.values():
        no_option *= len(candidates)
    print(no_option)
    
# Re-model with the selected parameters to verify the search result
def rebuild_on_best_param(ad_reg):
    """Cross-validate ``ad_reg`` and print its train and test RMSE.

    Uses the module-level ``X`` and ``y`` with a 5-fold shuffled split
    (random_state=1412) scored by negative MSE, then reports the averaged
    per-fold RMSE through ``RMSE``. Nothing is returned.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=1412)
    scores = cross_validate(
        ad_reg,
        X,
        y,
        cv=splitter,
        scoring="neg_mean_squared_error",
        return_train_score=True,
        verbose=True,
        n_jobs=-1,
    )
    train_rmse = RMSE(scores, "train_score")
    print("训练RMSE:{:.3f}".format(train_rmse))
    test_rmse = RMSE(scores, "test_score")
    print("测试RMSE:{:.3f}".format(test_rmse))


# Illustrate how grid search and random search cover the same 2-D
# (n_estimators, max_depth) parameter space.
fig, [ax1, ax2] = plt.subplots(1,2,dpi=300)
n_e_list = [*range(50,350,50)]
m_d_list = [*range(2,7)]
comb = pd.DataFrame([(n_estimators, max_depth) for n_estimators in n_e_list for max_depth in m_d_list])

# Left panel: grid search evaluates every combination.
# `cmap` is dropped here: without per-point values in `c` matplotlib ignores
# it and only emits a warning.
ax1.scatter(comb.iloc[:,0],comb.iloc[:,1])
ax1.set_xticks(n_e_list)
ax1.set_yticks(m_d_list)
ax1.set_xlabel("n_estimators")
ax1.set_ylabel("max_depth")
ax1.set_title("GridSearch")

# Right panel: the same grid with a randomly chosen subset highlighted.
ax2.scatter(comb.iloc[:,0],comb.iloc[:,1])
# "red" is a colour, not a colormap name, so it has to be passed as `color`;
# as `cmap="red"` it was silently ignored and the points were not red.
ax2.scatter([50,250,200,200,300,100,150,150],[4,2,6,3,2,3,2,5],color="red",s=20,linewidths=5)
ax2.set_xticks(n_e_list)
ax2.set_yticks(m_d_list)
ax2.set_xlabel("n_estimators")
ax2.set_ylabel("max_depth")
ax2.set_title("RandomSearch");


from sklearn.model_selection import RandomizedSearchCV


X.shape

(1460, 80)


X.head()


y.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


# Define the full parameter space - the same space used for the grid search,
# so that the two methods can be compared.
param_grid_simple = {
    "criterion": ["squared_error", "poisson"],
    "n_estimators": [*range(20, 100, 5)],
    "max_depth": [*range(10, 25, 2)],
    # 1.0 (all features) is what the legacy "auto" alias meant for regressors;
    # "auto" was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
    "max_features": ["log2", "sqrt", 16, 32, 64, 1.0],
    # NOTE: np.arange(0, 5, 10) produces the single value 0 (step > range).
    "min_impurity_decrease": [*np.arange(0, 5, 10)],
}

# Regressor and cross-validation splitter used for training.
reg = RFR(
    n_jobs=-1,
    verbose=True,
    random_state=1412,
)
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1412,
)


# Size of the full parameter space: the upper bound on how many
# combinations can be sampled from it.
count_space(param_grid_simple)

1536


# Define the randomized search.
search = RandomizedSearchCV(
    estimator=reg,
    param_distributions=param_grid_simple,
    n_iter=800,  # subspace size: roughly half of the full space
    scoring="neg_mean_squared_error",
    cv=cv,
    verbose=True,
    random_state=1412,
    n_jobs=-1,
)


# Fit the randomized search.
# ===== TIME WARNING: 5~10 min =====
start = time.time()
search.fit(X, y)
print(time.time() - start)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits
170.16785073280334

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  85 out of  85 | elapsed:    0.0s finished


170.1678/60 # 耗时分钟

2.83613


# 查看模型结果，即最优参数
search.best_estimator_

RandomForestRegressor(max_depth=24, max_features=16, min_impurity_decrease=0,
                      n_estimators=85, n_jobs=-1, random_state=1412,
                      verbose=True)


abs(search.best_score_)**0.5 # 最好的RMSE

29251.284326350575


# Rebuild the model from the best parameters reported by the search.
ad_reg = RFR(
    n_estimators=85,
    max_depth=24,
    max_features=16,
    min_impurity_decrease=0,
    random_state=1412,
    n_jobs=-1,
    verbose=True,
)
# Evaluate it with the helper defined earlier in the notebook.
rebuild_on_best_param(ad_reg)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.

训练RMSE:11031.299
测试RMSE:28639.969

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


from mpl_toolkits.mplot3d import axes3d


# get_test_data is a matplotlib helper that generates a ready-made sample surface.
p1, p2, MSE = axes3d.get_test_data(0.05)

# For this demo, treat p1 and p2 as two hyper-parameters and MSE as the loss
# value attached to each (p1, p2) combination.
# The argument 0.05 is the spacing between neighbouring points of the
# parameter space, so the smaller it is, the more points are generated.


len(p1) #参数1的取值有120个

120


len(p2) #参数2的取值也有120个

120


MSE.shape #损失函数值，总共14400个点

(120, 120)


# Plot the (p1, p2) parameter space - a dense cloud of 14400 points.
plt.figure(dpi=300)
plt.scatter(p1, p2, s=0.2)
# Shrink the tick labels on both axes.
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=9)


# Loss surface spanned by the two parameters.
p1, p2, MSE = axes3d.get_test_data(0.05)
plt.figure(dpi=300)
ax = plt.axes(projection="3d")
ax.plot_wireframe(p1, p2, MSE, rstride=2, cstride=2, linewidth=0.5)
#ax.view_init(2, -15) # rotate the 3-D view to see the surface more clearly
# Use small tick labels on all three axes.
for axis in (ax.zaxis, ax.xaxis, ax.yaxis):
    axis.set_tick_params(labelsize=7)


# Loss surface spanned by the two parameters, seen from a rotated viewpoint.
p1, p2, MSE = axes3d.get_test_data(0.05)
plt.figure(dpi=300)
ax = plt.axes(projection="3d")
ax.plot_wireframe(p1, p2, MSE, rstride=2, cstride=2, linewidth=0.5)
ax.view_init(2, -15)  # rotate the 3-D view to see the surface more clearly
# Use small tick labels on all three axes.
for axis in (ax.zaxis, ax.xaxis, ax.yaxis):
    axis.set_tick_params(labelsize=7)


np.min(MSE) #整个参数空间中，可获得的MSE最小值是-73.39

-73.39620971601681


# 现在，我们从该空间上抽取子空间
import numpy as np


MSE.shape

(120, 120)


# Draw n combinations from the space; the larger n, the larger the subspace.
# The full space holds 14400 combinations.

# A sampled point keeps its loss value (MSE); a point that was not sampled has
# no loss value. So it is enough to find the unsampled points and set their
# MSE to NaN.

n = 100


p1, p2, MSE = axes3d.get_test_data(0.05)

# Indices of the points that are NOT part of the subspace. They must be drawn
# WITHOUT replacement: np.random.randint repeats indices, so it masked far
# fewer than (size - n) distinct points and left thousands of points (about a
# third of the grid) in the "subspace" instead of n.
unsampled = np.random.choice(MSE.size, MSE.size - n, replace=False)

# Flatten MSE, blank out every unsampled point, then restore the grid shape
# because the wireframe plot needs 2-D input.
MSE = MSE.ravel()
MSE[unsampled] = np.nan
MSE = MSE.reshape(p1.shape)

# Loss surface restricted to the sampled subspace.
plt.figure(dpi=300)
ax = plt.axes(projection="3d")
ax.view_init(2, -15)
ax.plot_wireframe(p1,p2,MSE,rstride=2,cstride=2,linewidth=0.5)
# Isolated samples have no sampled neighbours to connect to, so the wireframe
# alone is nearly empty for small n; also mark each sampled point.
sampled = ~np.isnan(MSE)
ax.scatter(p1[sampled], p2[sampled], MSE[sampled], s=1)
ax.zaxis.set_tick_params(labelsize=7)
ax.xaxis.set_tick_params(labelsize=7)
ax.yaxis.set_tick_params(labelsize=7);

# Minimum loss inside the subspace. NaN placeholders are dropped first,
# otherwise min would return NaN. MSE ends up as a flat list of the sampled
# losses, as before.
MSE = MSE[sampled].tolist()
print(np.min(MSE))

-73.24243733589367


# Build a denser parameter space: every integer in each range is a candidate.
param_grid_simple = {
    "n_estimators": list(range(80, 100)),
    "max_depth": list(range(10, 25)),
    "max_features": list(range(10, 20)),
    "min_impurity_decrease": list(np.arange(0, 5, 10)),
}


# 调用自定义函数，计算全域参数空间大小，这是我们能够抽样的最大值
count_space(param_grid_simple)

3000


# Regressor and cross-validation splitter.
reg = RFR(n_jobs=-1, verbose=True, random_state=1412)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)

# Define the randomized search.
search = RandomizedSearchCV(
    estimator=reg,
    param_distributions=param_grid_simple,
    n_iter=1536,  # about as many candidates as the exhaustive grid search
    scoring="neg_mean_squared_error",
    cv=cv,
    verbose=True,
    random_state=1412,
    n_jobs=-1,
)


# Fit the randomized search and report the elapsed time in minutes.
# ===== TIME WARNING: 5~10 min =====
start = time.time()
search.fit(X, y)
end = time.time() - start
print(end/60)

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits
3.8464645385742187

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  89 out of  89 | elapsed:    0.0s finished


#查看最佳评估器
search.best_estimator_

RandomForestRegressor(max_depth=22, max_features=14, min_impurity_decrease=0,
                      n_estimators=89, n_jobs=-1, random_state=1412,
                      verbose=True)


#查看最终评估指标
abs(search.best_score_)**0.5

29012.90569846546


rebuild_on_best_param(search.best_estimator_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.

训练RMSE:11208.818
测试RMSE:28346.673

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


import scipy # 使用scipy来帮助我们建立分布


scipy.stats.uniform(loc=1,scale=100) # stats模块中包含各种数学分布，uniform是均匀分布

<scipy.stats._distn_infrastructure.rv_frozen at 0x137a147d7c0>


# Same dense integer ranges as before, but min_impurity_decrease is now a
# continuous distribution built with uniform(0, 50) rather than a list.
param_grid_simple = {
    "n_estimators": list(range(80, 100)),
    "max_depth": list(range(10, 25)),
    "max_features": list(range(10, 20)),
    "min_impurity_decrease": scipy.stats.uniform(0, 50),
}


# Regressor and cross-validation splitter.
reg = RFR(n_jobs=12, verbose=True, random_state=1412)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)

# Define the randomized search.
search = RandomizedSearchCV(
    estimator=reg,
    param_distributions=param_grid_simple,
    n_iter=1536,  # keep the same number of sampled candidates: 1536
    scoring="neg_mean_squared_error",
    cv=cv,
    verbose=True,
    random_state=1412,
    n_jobs=12,
)


# Fit the randomized search and report the elapsed time in minutes.
# ===== TIME WARNING: 5~10 min =====
start = time.time()
search.fit(X, y)
end = time.time() - start
print(end/60)

Fitting 5 folds for each of 1536 candidates, totalling 7680 fits
3.921058924992879

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done  98 out of  98 | elapsed:    0.0s finished


#查看最佳评估器
search.best_estimator_

RandomForestRegressor(max_depth=22, max_features=14,
                      min_impurity_decrease=20.070367229896224, n_estimators=98,
                      n_jobs=12, random_state=1412, verbose=True)


#查看最终评估指标
abs(search.best_score_)**0.5

29148.381610182565


rebuild_on_best_param(search.best_estimator_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.

训练RMSE:11184.428
测试RMSE:28495.682

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


# Print how a starting size of n grows when it is tripled 14 times in a row.
n = 10
sizes = [n * 3 ** power for power in range(15)]
for i, size in enumerate(sizes):
    print(i, size)

0 10
1 30
2 90
3 270
4 810
5 2430
6 7290
7 21870
8 65610
9 196830
10 590490
11 1771470
12 5314410
13 15943230
14 47829690


# Load the larger House Price data set; the first CSV column becomes the index.
data2 = pd.read_csv(r"..\Lesson 09.随机森林模型\datasets\House Price\big_train.csv",index_col=0)


# Features are every column except the last; the label is the last column.
X = data2.iloc[:,:-1]
y = data2.iloc[:,-1]


X.shape

(29062, 80)


y.describe()

count     29062.000000
mean     182798.864703
std       72379.404452
min       34900.000000
25%      139000.000000
50%      169092.000000
75%      203009.750000
max      755000.000000
Name: SalePrice, dtype: float64


X.head()


import re
import sklearn
import numpy as np
import pandas as pd
import matplotlib as mlp
import matplotlib.pyplot as plt
import time
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import KFold, HalvingGridSearchCV, cross_validate, RandomizedSearchCV


# Parameter space for the halving search (same space as the earlier searches).
param_grid_simple = {
    "criterion": ["squared_error", "poisson"],
    "n_estimators": [*range(20, 100, 5)],
    "max_depth": [*range(10, 25, 2)],
    # 1.0 (all features) is what the legacy "auto" alias meant for regressors;
    # "auto" was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
    "max_features": ["log2", "sqrt", 16, 32, 64, 1.0],
    # NOTE: np.arange(0, 5, 10) produces the single value 0 (step > range).
    "min_impurity_decrease": [*np.arange(0, 5, 10)],
}


count_space(param_grid_simple)

1536


X.shape

(29062, 80)


# With ~29k samples, factor=2 and min_resources=100, how many rounds fit?
# Double the per-round sample count until it would exceed the 29062 rows.
i = 0
n_resources = 100
while i < 100 and n_resources <= 29062:
    print(i + 1, n_resources)
    n_resources *= 2
    i += 1

1 100
2 200
3 400
4 800
5 1600
6 3200
7 6400
8 12800
9 25600


import math

# With 1536 parameter combinations and factor=2, how many rounds are possible?
# Each round keeps ceil(previous / factor) candidates. The earlier
# "floor + 1" formula over-counted whenever the division was exact
# (it printed 1537 for the very first round, which has 1536 candidates).
n_candidates = 1536
for i in range(100):
    if 1536//2**i < 1:
        break
    print(i+1, n_candidates)
    n_candidates = math.ceil(n_candidates / 2)  # round up

1 1537
2 769
3 385
4 193
5 97
6 49
7 25
8 13
9 7
10 4
11 2


import math

# Settings of the planned halving search.
factor = 1.5
n_samples = X.shape[0]
min_resources = 500
space = 1536


# Preview the halving schedule: samples used and candidates evaluated per
# round. Resources are truncated to whole samples and the candidate count is
# ceil(previous / factor); this reproduces the n_resources / n_candidates
# values that HalvingGridSearchCV logs. The earlier "floor + 1" formula was
# off by one on several rounds and printed fractional sample counts.
n_candidates = space
for i in range(100):
    if (min_resources*factor**i > n_samples) or (space/factor**i < 1):
        break
    n_resources = int(min_resources*factor**i)
    print(i+1,"本轮迭代样本:{}".format(n_resources)
          ,"本轮验证参数组合:{}".format(n_candidates))
    n_candidates = math.ceil(n_candidates / factor)

1 本轮迭代样本:500.0 本轮验证参数组合:1537.0
2 本轮迭代样本:750.0 本轮验证参数组合:1025.0
3 本轮迭代样本:1125.0 本轮验证参数组合:683.0
4 本轮迭代样本:1687.5 本轮验证参数组合:456.0
5 本轮迭代样本:2531.25 本轮验证参数组合:304.0
6 本轮迭代样本:3796.875 本轮验证参数组合:203.0
7 本轮迭代样本:5695.3125 本轮验证参数组合:135.0
8 本轮迭代样本:8542.96875 本轮验证参数组合:90.0
9 本轮迭代样本:12814.453125 本轮验证参数组合:60.0
10 本轮迭代样本:19221.6796875 本轮验证参数组合:40.0
11 本轮迭代样本:28832.51953125 本轮验证参数组合:27.0


# Regressor and cross-validation splitter.
reg = RFR(n_jobs=-1, verbose=True, random_state=1412)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)

# Define the halving grid search.
search = HalvingGridSearchCV(
    estimator=reg,
    param_grid=param_grid_simple,
    factor=1.5,
    min_resources=500,
    scoring="neg_mean_squared_error",
    cv=cv,
    verbose=True,
    random_state=1412,
    n_jobs=-1,
)


# Fit the halving search and report the elapsed time in minutes.
# ===== TIME WARNING: 30~50 min =====
start = time.time()
search.fit(X, y)
end = time.time() - start
print(end/60)

n_iterations: 11
n_required_iterations: 19
n_possible_iterations: 11
min_resources_: 500
max_resources_: 29062
aggressive_elimination: False
factor: 1.5
----------
iter: 0
n_candidates: 1536
n_resources: 500
Fitting 5 folds for each of 1536 candidates, totalling 7680 fits
----------
iter: 1
n_candidates: 1024
n_resources: 750
Fitting 5 folds for each of 1024 candidates, totalling 5120 fits
----------
iter: 2
n_candidates: 683
n_resources: 1125
Fitting 5 folds for each of 683 candidates, totalling 3415 fits
----------
iter: 3
n_candidates: 456
n_resources: 1687
Fitting 5 folds for each of 456 candidates, totalling 2280 fits
----------
iter: 4
n_candidates: 304
n_resources: 2531
Fitting 5 folds for each of 304 candidates, totalling 1520 fits
----------
iter: 5
n_candidates: 203
n_resources: 3796
Fitting 5 folds for each of 203 candidates, totalling 1015 fits
----------
iter: 6
n_candidates: 136
n_resources: 5695
Fitting 5 folds for each of 136 candidates, totalling 680 fits
----------
iter: 7
n_candidates: 91
n_resources: 8542
Fitting 5 folds for each of 91 candidates, totalling 455 fits
----------
iter: 8
n_candidates: 61
n_resources: 12814
Fitting 5 folds for each of 61 candidates, totalling 305 fits
----------
iter: 9
n_candidates: 41
n_resources: 19221
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 10
n_candidates: 28
n_resources: 28832
Fitting 5 folds for each of 28 candidates, totalling 140 fits

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.4s

25.638246742884316

[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.2s finished


#查看最佳评估器
search.best_estimator_

RandomForestRegressor(max_depth=22, max_features=16, min_impurity_decrease=0,
                      n_estimators=90, n_jobs=-1, random_state=1412,
                      verbose=True)


# Final evaluation metric: square root of the absolute value of the best CV score
abs(search.best_score_)**0.5

1068.281607238587


#验证最佳参数组合的效力
rebuild_on_best_param(search.best_estimator_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.2s finished

训练RMSE:475.740
测试RMSE:1082.916


# Parameter space for the randomized search on the large data set.
param_grid_simple = {
    "criterion": ["squared_error", "poisson"],
    "n_estimators": [*range(20, 100, 5)],
    "max_depth": [*range(10, 25, 2)],
    # 1.0 (all features) is what the legacy "auto" alias meant for regressors;
    # "auto" was deprecated in scikit-learn 1.1 and is rejected from 1.3 on.
    "max_features": ["log2", "sqrt", 16, 32, 64, 1.0],
    # NOTE: np.arange(0, 5, 10) produces the single value 0 (step > range).
    "min_impurity_decrease": [*np.arange(0, 5, 10)],
}

# Regressor and cross-validation splitter.
reg = RFR(n_jobs=-1, verbose=True, random_state=1412)
cv = KFold(n_splits=5, shuffle=True, random_state=1412)


# Define the randomized search.
search = RandomizedSearchCV(
    estimator=reg,
    param_distributions=param_grid_simple,
    n_iter=800,  # use about half of the full space as the subspace
    scoring="neg_mean_squared_error",
    cv=cv,
    verbose=True,
    random_state=1412,
    n_jobs=-1,
)


# Fit the randomized search and report the elapsed time in minutes.
# ===== TIME WARNING: 1.5 to 2 hours =====
start = time.time()
search.fit(X, y)
end = time.time()-start
print(end/60)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.3s

103.20144965251286

[Parallel(n_jobs=-1)]: Done  85 out of  85 | elapsed:    1.2s finished


#查看最佳评估器
search.best_estimator_

RandomForestRegressor(max_depth=24, max_features=16, min_impurity_decrease=0,
                      n_estimators=85, n_jobs=-1, random_state=1412,
                      verbose=True)


#查看最终评估指标
abs(search.best_score_)**0.5

1055.5552571413887


#验证最佳参数组合的效力
rebuild_on_best_param(search.best_estimator_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.1s finished

训练RMSE:465.198
测试RMSE:1054.359


#示例代码，本段代码不提供运行

#安装
!pip install hpsklearn

#导入
from hpsklearn import HyperoptEstimator

#实例化
estim = HyperoptEstimator()

#训练
estim.fit(Xtrain, Ytrain)
prediction = estim.predict(Xtest)
score = estin.score(Xtest,Ytest)

#直接返回sklearn中存在的，选择出的最好模型
model = estim.best_model()

HPO方法	默认参数	网格搜索
搜索空间/全域空间	-	1536/1536
运行时间（分钟）	-	6.36
搜索最优（RMSE）	30571.266	29179.698
重建最优（RMSE）	-	28572.070

Name	Description
estimator	调参对象，某评估器
param_distributions	全域参数空间，可以是字典或者字典构成的列表
n_iter	迭代次数，迭代次数越多，抽取的子参数空间越大
scoring	评估指标，支持同时输出多个参数
n_jobs	设置工作时参与计算的线程数
refit	挑选评估指标和最佳参数，在完整数据集上进行训练
cv	交叉验证的折数
verbose	输出工作日志形式
pre_dispatch	多任务并行时任务划分数量
random_state	随机数种子
error_score	当网格搜索报错时返回结果，选择'raise'时将直接报错并中断训练过程，其他情况会显示警告信息后继续完成训练
return_train_score	在交叉验证中是否显示训练集中参数得分

HPO方法	默认参数	网格搜索	随机搜索
搜索空间/全域空间	-	1536/1536	800/1536
运行时间（分钟）	-	6.36	2.83(↓)
搜索最优（RMSE）	30571.266	29179.698	29251.284
重建最优（RMSE）	-	28572.070	28639.969(↑)

HPO方法	默认参数	网格搜索	随机搜索	随机搜索 (大空间)
搜索空间/全域空间	-	1536/1536	800/1536	1536/3000
运行时间（分钟）	-	6.36	2.83(↓)	3.86(↓)
搜索最优（RMSE）	30571.266	29179.698	29251.284	29012.905(↓)
重建最优（RMSE）	-	28572.070	28639.969(↑)	28346.673(↓)

HPO方法	默认参数	网格搜索	随机搜索	随机搜索 (大空间)	随机搜索 (连续型)
搜索空间/全域空间	-	1536/1536	800/1536	1536/3000	1536/无限
运行时间（分钟）	-	6.36	2.83(↓)	3.86(↓)	3.92
搜索最优（RMSE）	30571.266	29179.698	29251.284	29012.905(↓)	29148.381
重建最优（RMSE）	-	28572.070	28639.969(↑)	28346.673(↓)	28495.682

超参数优化之网格优化¶

一超参数优化与枚举网格的理论极限¶

二随机网格搜索（RandomizedSearchCV）¶

三对半网格搜索（HalvingSearchCV）¶

四【加餐】自动化机器学习AutoML¶

AutoML的三大成熟研究领域¶

AutoML的新兴研究领域¶

AutoML真的会替代算法工程师吗？¶

	Id	住宅类型	住宅区域	街道接触面积(英尺)	住宅面积	街道路面状况	住宅形状(大概)	住宅现状	...	销售月份	销售年份	销售类型	销售状态
0	0.0	5.0	3.0	36.0	327.0	1.0	3.0	3.0	...	1.0	2.0	8.0	4.0
1	1.0	0.0	3.0	51.0	498.0	1.0	3.0	3.0	...	4.0	1.0	8.0	4.0
2	2.0	5.0	3.0	39.0	702.0	1.0	0.0	3.0	...	8.0	2.0	8.0	4.0
3	3.0	6.0	3.0	31.0	489.0	1.0	0.0	3.0	...	1.0	0.0	8.0	0.0
4	4.0	5.0	3.0	55.0	925.0	1.0	0.0	3.0	...	11.0	2.0	8.0	4.0

Name	Description
estimator	调参对象，某评估器
param_grid	参数空间，可以是字典或者字典构成的列表
factor	每轮迭代中新增的样本量的比例，同时也是每轮迭代后留下的参数组合的比例
resource	设置每轮迭代中增加的验证资源的类型
max_resources	在一次迭代中，允许被用来验证任意参数组合的最大样本量
min_resources	首次迭代时，用于验证参数组合的样本量r0
aggressive_elimination	是否以全部数据被使用完成作为停止搜索的指标，如果不是，则采取措施
cv	交叉验证的折数
scoring	评估指标，支持同时输出多个参数
refit	挑选评估指标和最佳参数，在完整数据集上进行训练
error_score	当网格搜索报错时返回结果，选择'raise'时将直接报错并中断训练过程，其他情况会显示警告信息后继续完成训练
return_train_score	在交叉验证中是否显示训练集中参数得分
random_state	控制随机抽样数据集的随机性
n_jobs	设置工作时参与计算的线程数
verbose	输出工作日志形式

迭代次数	子集样本量	参数组合数
1	S	C
2	2S	$\frac{1}{2}$C
3	4S	$\frac{1}{4}$C
4	8S	$\frac{1}{8}$C

超参数优化之网格优化¶

一 超参数优化与枚举网格的理论极限¶

二 随机网格搜索（RandomizedSearchCV）¶

三 对半网格搜索（HalvingSearchCV）¶

四 【加餐】自动化机器学习AutoML¶

AutoML的三大成熟研究领域¶

AutoML的新兴研究领域¶

AutoML真的会替代算法工程师吗？¶

一超参数优化与枚举网格的理论极限¶

二随机网格搜索（RandomizedSearchCV）¶

三对半网格搜索（HalvingSearchCV）¶

四【加餐】自动化机器学习AutoML¶