# 科学计算模块
import numpy as np
import pandas as pd

# 绘图模块
import matplotlib as mpl
import matplotlib.pyplot as plt

# 自定义模块
from ML_basic_function import *


# Build a 5x2 integer array holding the values 0..9, row by row.
A = np.arange(10).reshape(5, 2)
A

# Recorded notebook output of the cell above:
array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])


# Build a 5x1 column of the even numbers 0..8. Each entry equals the first
# column of the matching row of A, which makes the row pairing easy to track.
B = np.arange(0, 10, 2).reshape(-1, 1)
B

# Recorded notebook output of the cell above:
array([[0],
       [2],
       [4],
       [6],
       [8]])


# Seed the global NumPy RNG, then shuffle the rows of A in place.
np.random.seed(24)
np.random.shuffle(A)

A

# Recorded notebook output: the rows of A are now reordered.
array([[8, 9],
       [2, 3],
       [0, 1],
       [6, 7],
       [4, 5]])

B

# Recorded notebook output: B has not been shuffled yet, so its rows no
# longer line up with the rows of A.
array([[0],
       [2],
       [4],
       [6],
       [8]])


# Re-seed with the same value before shuffling B so that B receives the same
# row permutation that was applied to A.
np.random.seed(24)
np.random.shuffle(B)

B

# Recorded notebook output: B now follows the same row order as A.
array([[8],
       [2],
       [0],
       [6],
       [4]])

A

# Recorded notebook output: A is unchanged since its own shuffle, so each row
# of A is paired with the matching row of B again.
array([[8, 9],
       [2, 3],
       [0, 1],
       [6, 7],
       [4, 5]])


# Split A between its second and third rows (i.e. at row index 2);
# np.vsplit returns the pieces as a list of arrays.
np.vsplit(A,[2, ])

# Recorded notebook output of the cell above:
[array([[8, 9],
        [2, 3]]),
 array([[0, 1],
        [6, 7],
        [4, 5]])]


def array_split(features, labels, rate=0.7, random_state=24):
    """
    Shuffle a data set and split it into a training part and a test part.

    A single random permutation of the sample indices is drawn and applied to
    both ``features`` and ``labels``, so sample i of the features always stays
    paired with sample i of the labels. The caller's arrays are not modified
    (shuffled copies are returned) and the global NumPy random state is not
    touched. For an integer ``random_state`` the resulting row order is the
    same as the one produced by ``np.random.seed(random_state)`` followed by
    ``np.random.shuffle(...)``.

    :param features: feature array (array-like), samples along the first axis
    :param labels: label array (array-like) with the same number of samples
        as ``features``; may be 1-D or N-D
    :param rate: fraction of the samples assigned to the training set; the
        training size is ``int(len(labels) * rate)``
    :param random_state: seed for the shuffle; ``None`` draws a fresh,
        non-reproducible permutation
    :return: Xtrain, Xtest, ytrain, ytest -- training and test parts of the
        features, followed by training and test parts of the labels
    :raises ValueError: if ``features`` and ``labels`` hold a different
        number of samples
    """
    features = np.asarray(features)
    labels = np.asarray(labels)

    num_input = len(labels)                                 # total number of samples
    if len(features) != num_input:
        raise ValueError(
            "features and labels must contain the same number of samples, "
            "got %d and %d" % (len(features), num_input)
        )

    # A private generator keeps the global RNG state intact. Drawing ONE
    # permutation (instead of seeding and shuffling twice) guarantees that
    # features and labels are reordered identically, even when the seed is None.
    order = np.random.RandomState(random_state).permutation(num_input)

    # Fancy indexing returns new arrays, so the inputs stay untouched.
    shuffled_features = features[order]
    shuffled_labels = labels[order]

    split_index = int(num_input * rate)                     # first index of the test part

    # Plain slicing along axis 0 matches np.vsplit for 2-D input and also
    # accepts 1-D label vectors, which np.vsplit rejects.
    Xtrain, Xtest = shuffled_features[:split_index], shuffled_features[split_index:]
    ytrain, ytest = shuffled_labels[:split_index], shuffled_labels[split_index:]
    return Xtrain, Xtest, ytrain, ytest


f = np.arange(10).reshape(-1, 1)                 # features: a column holding 0..9
f

# Recorded notebook output of the cell above:
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5],
       [6],
       [7],
       [8],
       [9]])


l = np.arange(1, 11).reshape(-1, 1)             # labels: a column holding 1..10, i.e. always feature + 1
l

# Recorded notebook output of the cell above:
array([[ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10]])


# Split with the default arguments (rate=0.7, random_state=24).
array_split(f, l)

# Recorded notebook output, in the order (Xtrain, Xtest, ytrain, ytest):
# 7 training rows and 3 test rows, and every label still equals its
# feature + 1, so the feature/label pairing survived the shuffle.
(array([[9],
        [4],
        [8],
        [7],
        [5],
        [6],
        [1]]),
 array([[0],
        [3],
        [2]]),
 array([[10],
        [ 5],
        [ 9],
        [ 8],
        [ 6],
        [ 7],
        [ 2]]),
 array([[1],
        [4],
        [3]]))


# Seed the global NumPy RNG
np.random.seed(24)   

# Generate the data set with the perturbation (noise) term set to 0.01
# NOTE(review): arrayGenReg comes from ML_basic_function; its other defaults
# and the layout of the arrays it returns are not visible here -- confirm.
features, labels = arrayGenReg(delta=0.01)

# Split the data into training and test sets
Xtrain, Xtest, ytrain, ytest = array_split(features, labels)


# Solve the normal equation w = (X^T X)^-1 X^T y using the training set only.
w = np.linalg.inv(Xtrain.T.dot(Xtrain)).dot(Xtrain.T).dot(ytrain)
w

# Recorded notebook output: the fitted weights, a 3x1 column.
array([[ 1.99976073],
       [-0.99986178],
       [ 0.99934303]])


# Evaluate the fitted weights on the held-out test set.
# NOTE(review): SSELoss is imported from ML_basic_function; presumably the sum
# of squared errors of Xtest.dot(w) against ytest -- confirm.
SSELoss(Xtest, w, ytest)

# Recorded notebook output of the cell above:
array([[0.02725208]])

Lesson 3.4 机器学习模型结果可信度理论与交叉验证基础

一、机器学习模型结果可信度理论基础与数据集划分

1. 机器学习模型结果可信度基础理论

2. 数据集切分方法

3. 线性回归手动实现

4. 测试集的“不可知”悖论

二、交叉验证基本思想