In [1]:
#导入相应包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
pip list
Package                            Version            
---------------------------------- -------------------
alabaster                          0.7.12             
anaconda-client                    1.7.2              
anaconda-navigator                 1.9.12             
anaconda-project                   0.8.3              
argh                               0.26.2             
asn1crypto                         1.3.0              
astroid                            2.3.3              
astropy                            4.0                
atomicwrites                       1.3.0              
attrs                              19.3.0             
autopep8                           1.4.4              
Babel                              2.8.0              
backcall                           0.1.0              
backports.functools-lru-cache      1.6.1              
backports.shutil-get-terminal-size 1.0.0              
backports.tempfile                 1.0                
backports.weakref                  1.0.post1          
bcrypt                             3.1.7              
beautifulsoup4                     4.8.2              
bitarray                           1.2.1              
bkcharts                           0.2                
bleach                             3.1.0              
bokeh                              1.4.0              
boto                               2.49.0             
Bottleneck                         1.3.2              
certifi                            2019.11.28         
cffi                               1.14.0             
chardet                            3.0.4              
chart-studio                       1.1.0              
Click                              7.0                
cloudpickle                        1.3.0              
clyent                             1.2.2              
colorama                           0.4.3              
colorlover                         0.3.0              
comtypes                           1.1.7              
conda                              4.8.2              
conda-build                        3.18.11            
conda-package-handling             1.6.0              
conda-verify                       3.4.2              
contextlib2                        0.6.0.post1        
cryptography                       2.8                
cycler                             0.10.0             
Cython                             0.29.15            
cytoolz                            0.10.1             
dask                               2.11.0             
decorator                          4.4.1              
defusedxml                         0.6.0              
diff-match-patch                   20181111           
distributed                        2.11.0             
docutils                           0.16               
entrypoints                        0.3                
et-xmlfile                         1.0.1              
fastcache                          1.1.0              
filelock                           3.0.12             
flake8                             3.7.9              
Flask                              1.1.1              
fsspec                             0.6.2              
future                             0.18.2             
gevent                             1.4.0              
glob2                              0.7                
graphviz                           0.19               
greenlet                           0.4.15             
h5py                               2.10.0             
HeapDict                           1.0.1              
html5lib                           1.0.1              
hypothesis                         5.5.4              
idna                               2.8                
imageio                            2.6.1              
imagesize                          1.2.0              
importlib-metadata                 1.5.0              
intervaltree                       3.0.2              
ipykernel                          5.1.4              
ipython                            7.12.0             
ipython-genutils                   0.2.0              
ipywidgets                         7.5.1              
isort                              4.3.21             
itsdangerous                       1.1.0              
jdcal                              1.4.1              
jedi                               0.14.1             
Jinja2                             2.11.1             
joblib                             0.14.1             
json5                              0.9.1              
jsonschema                         3.2.0              
jupyter                            1.0.0              
jupyter-client                     5.3.4              
jupyter-console                    6.1.0              
jupyter-core                       4.6.1              
jupyterlab                         1.2.6              
jupyterlab-server                  1.0.6              
keyring                            21.1.0             
kiwisolver                         1.1.0              
lazy-object-proxy                  1.4.3              
libarchive-c                       2.8                
llvmlite                           0.31.0             
locket                             0.2.0              
lxml                               4.5.0              
MarkupSafe                         1.1.1              
matplotlib                         3.1.3              
mccabe                             0.6.1              
menuinst                           1.4.16             
mistune                            0.8.4              
mkl-fft                            1.0.15             
mkl-random                         1.1.0              
mkl-service                        2.3.0              
mlxtend                            0.19.0             
mock                               4.0.1              
more-itertools                     8.2.0              
mpmath                             1.1.0              
msgpack                            0.6.1              
multipledispatch                   0.6.0              
navigator-updater                  0.2.1              
nbconvert                          5.6.1              
nbformat                           5.0.4              
nbmerge                            0.0.4              
networkx                           2.4                
nltk                               3.4.5              
nose                               1.3.7              
notebook                           6.0.3              
numba                              0.48.0             
numexpr                            2.7.1              
numpy                              1.18.1             
numpydoc                           0.9.2              
olefile                            0.46               
openpyxl                           3.0.3              
packaging                          20.1               
pandas                             1.0.1              
pandasql                           0.7.3              
pandocfilters                      1.4.2              
paramiko                           2.7.1              
parso                              0.5.2              
partd                              1.1.0              
path                               13.1.0             
pathlib2                           2.3.5              
pathtools                          0.1.2              
patsy                              0.5.1              
pep8                               1.7.1              
pexpect                            4.8.0              
pickleshare                        0.7.5              
Pillow                             7.0.0              
pip                                20.0.2             
pkginfo                            1.5.0.1            
plotly                             5.5.0              
pluggy                             0.13.1             
ply                                3.11               
prometheus-client                  0.7.1              
prompt-toolkit                     3.0.3              
psutil                             5.6.7              
py                                 1.8.1              
pycodestyle                        2.5.0              
pycosat                            0.6.3              
pycparser                          2.19               
pycrypto                           2.6.1              
pycurl                             7.43.0.5           
pydocstyle                         4.0.1              
pyflakes                           2.1.1              
Pygments                           2.5.2              
pylint                             2.4.4              
PyNaCl                             1.3.0              
pyodbc                             4.0.0-unsupported  
pyOpenSSL                          19.1.0             
pyparsing                          2.4.6              
pyreadline                         2.1                
pyrsistent                         0.15.7             
PySocks                            1.7.1              
pytest                             5.3.5              
pytest-arraydiff                   0.3                
pytest-astropy                     0.8.0              
pytest-astropy-header              0.1.2              
pytest-doctestplus                 0.5.0              
pytest-openfiles                   0.4.0              
pytest-remotedata                  0.3.2              
python-dateutil                    2.8.1              
python-jsonrpc-server              0.3.4              
python-language-server             0.31.7             
pytz                               2019.3             
PyWavelets                         1.1.1              
pywin32                            227                
pywin32-ctypes                     0.2.0              
pywinpty                           0.5.7              
PyYAML                             5.3                
pyzmq                              18.1.1             
QDarkStyle                         2.8                
QtAwesome                          0.6.1              
qtconsole                          4.6.0              
QtPy                               1.9.0              
requests                           2.22.0             
retrying                           1.3.3              
rope                               0.16.0             
Rtree                              0.9.3              
ruamel-yaml                        0.15.87            
scikit-image                       0.16.2             
scikit-learn                       0.22.1             
scipy                              1.4.1              
seaborn                            0.10.0             
Send2Trash                         1.5.0              
setuptools                         45.2.0.post20200210
simplegeneric                      0.8.1              
singledispatch                     3.4.0.3            
six                                1.14.0             
snowballstemmer                    2.0.0              
sortedcollections                  1.1.2              
sortedcontainers                   2.1.0              
soupsieve                          1.9.5              
Sphinx                             2.4.0              
sphinxcontrib-applehelp            1.0.1              
sphinxcontrib-devhelp              1.0.1              
sphinxcontrib-htmlhelp             1.0.2              
sphinxcontrib-jsmath               1.0.1              
sphinxcontrib-qthelp               1.0.2              
sphinxcontrib-serializinghtml      1.1.3              
sphinxcontrib-websupport           1.2.0              
spyder                             4.0.1              
spyder-kernels                     1.8.1              
SQLAlchemy                         1.3.13             
statsmodels                        0.11.0             
sympy                              1.5.1              
tables                             3.6.1              
tblib                              1.6.0              
tenacity                           8.0.1              
terminado                          0.8.3              
testpath                           0.4.4              
toolz                              0.10.0             
tornado                            6.0.3              
tqdm                               4.42.1             
traitlets                          4.3.3              
ujson                              1.35               
unicodecsv                         0.14.1             
urllib3                            1.25.8             
watchdog                           0.10.2             
wcwidth                            0.1.8              
webencodings                       0.5.1              
Werkzeug                           1.0.0              
wheel                              0.34.2             
widgetsnbextension                 3.5.1              
win-inet-pton                      1.1.0              
win-unicode-console                0.5                
wincertstore                       0.2                
wrapt                              1.11.2             
xlrd                               1.2.0              
XlsxWriter                         1.2.7              
xlwings                            0.17.1             
xlwt                               1.3.0              
xmltodict                          0.12.0             
yapf                               0.28.0             
zict                               1.0.0              
zipp                               2.2.0              
Note: you may need to restart the kernel to use updated packages.
In [3]:
# Load the sample data set. The file has no header row, so pandas labels the
# columns with integers (0, 1, ...).
testSet = pd.read_table(
    'testSet - 降维.txt',
    header=None,
)
testSet
Out[3]:
0 1
0 -1 -2
1 -1 0
2 0 0
3 2 1
4 0 1
In [4]:
# Scatter plot of the raw data: column 0 on the x-axis, column 1 on the y-axis.
# The trailing semicolon suppresses the textual repr of the returned artist.
plt.scatter(
    testSet.iloc[:, 0],
    testSet.iloc[:, 1],
    c='black',
    marker='.',
);

1)去除平均值(去均值化)

In [5]:
# Bind the loaded frame to the name used by the following cells
# (this is a second name for the same object; no copy is made).
dataSet = testSet
# Per-column mean: axis=0 reduces over the rows.
meanVals = dataSet.mean(axis=0)
meanVals
Out[5]:
0    0.0
1    0.0
dtype: float64
In [6]:
# Preview the first five rows of the working data.
dataSet.head(n=5)
Out[6]:
0 1
0 -1 -2
1 -1 0
2 0 0
3 2 1
4 0 1
In [7]:
# Centre the data: subtract each column's mean from every value in that
# column, so that every column of the result has mean 0.
meanRemoved = dataSet.sub(meanVals, axis='columns')
meanRemoved.head()
Out[7]:
0 1
0 -1.0 -2.0
1 -1.0 0.0
2 0.0 0.0
3 2.0 1.0
4 0.0 1.0
In [8]:
# Sanity check: the column means of the centred data should be (numerically) zero.
meanRemoved.mean(axis=0)
Out[8]:
0    0.0
1    0.0
dtype: float64

Ps:如果这里的结果显示为 x.xxxe-xxx(科学计数法),说明它是一个极小的浮点数。由于计算机的浮点数精度有限,计算结果可能不会恰好等于0,把它视为0即可。(想了解原因,可以参考《计算机组成原理》中关于浮点数表示的内容。)

2)计算协方差并将其变为矩阵

Ps:这里numpy的协方差计算函数np.cov有着很多参数,具体可以参考:https://blog.csdn.net/jeffery0207/article/details/83032325
rowvar:默认为True,此时每一行代表一个变量(属性),每一列代表一个观测;为False时,则反之
bias:默认为False,此时标准化时除以n-1;反之为n。其中n为观测数
为什么分母是n-1?——因为用样本估计总体协方差时,除以n-1才是无偏估计;不熟悉的话可以查阅统计学中关于无偏估计的内容。
要注意的是,为了与文章保持一致,本代码在计算协方差时没有采用无偏估计,而是设置 bias=True,即除以n!

In [9]:
# Covariance matrix of the centred data, computed with numpy's cov function.
# rowvar=False: each column is a variable and each row is an observation.
# bias=True: normalise by n instead of n-1 (no unbiased-estimate correction),
# to stay consistent with the accompanying article.
# np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed;
# on older NumPy both names refer to the same function.
covMat = np.asmatrix(np.cov(meanRemoved, rowvar=False, bias=True))
covMat
Out[9]:
matrix([[1.2, 0.8],
        [0.8, 1.2]])

3)计算协方差矩阵的特征值、特征向量

In [10]:
# Compute the eigenvalues and right eigenvectors of the covariance matrix.
# np.linalg.eig does not guarantee any ordering of the eigenvalues, which is
# why they are sorted explicitly in a later step.
eigVals,eigVects = np.linalg.eig(covMat)

# Display the eigenvalues.
eigVals
Out[10]:
array([2. , 0.4])
In [11]:
# Eigenvector matrix: column i is the eigenvector paired with eigVals[i].
# NOTE(review): "P" presumably refers to the projection matrix named P in the
# accompanying article — confirm.
eigVects
Out[11]:
matrix([[ 0.70710678, -0.70710678],
        [ 0.70710678,  0.70710678]])

4)对特征值排序并保留N个特征

In [12]:
# Number of leading components to keep.
N = 1

# np.argsort orders the eigenvalue indices ascending by default, so reverse
# them to get descending order and keep the first N (the N largest eigenvalues).
eigValInd = np.argsort(eigVals)[::-1][:N]
eigValInd
Out[12]:
array([0], dtype=int64)
In [13]:
# Keep only the eigenvector columns whose indices are in eigValInd, i.e. the
# eigenvectors of the N largest eigenvalues. This is the reduced projection matrix.
redEigVects = eigVects[:,eigValInd]
redEigVects
Out[13]:
matrix([[0.70710678],
        [0.70710678]])

5)转换到新空间

In [14]:
# Project the centred data onto the retained eigenvector(s):
# (samples x features) * (features x N) -> (samples x N).
# meanRemoved is a DataFrame, so it is converted to a matrix first so that
# `*` performs matrix multiplication. np.asmatrix replaces the np.mat alias,
# which NumPy 2.0 removed; on older NumPy the result is identical.
lowDDataMat = np.asmatrix(meanRemoved) * redEigVects
lowDDataMat
Out[14]:
matrix([[-2.12132034],
        [-0.70710678],
        [ 0.        ],
        [ 2.12132034],
        [ 0.70710678]])
In [ ]: