#导入相应包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pip list
Package Version ---------------------------------- ------------------- alabaster 0.7.12 anaconda-client 1.7.2 anaconda-navigator 1.9.12 anaconda-project 0.8.3 argh 0.26.2 asn1crypto 1.3.0 astroid 2.3.3 astropy 4.0 atomicwrites 1.3.0 attrs 19.3.0 autopep8 1.4.4 Babel 2.8.0 backcall 0.1.0 backports.functools-lru-cache 1.6.1 backports.shutil-get-terminal-size 1.0.0 backports.tempfile 1.0 backports.weakref 1.0.post1 bcrypt 3.1.7 beautifulsoup4 4.8.2 bitarray 1.2.1 bkcharts 0.2 bleach 3.1.0 bokeh 1.4.0 boto 2.49.0 Bottleneck 1.3.2 certifi 2019.11.28 cffi 1.14.0 chardet 3.0.4 chart-studio 1.1.0 Click 7.0 cloudpickle 1.3.0 clyent 1.2.2 colorama 0.4.3 colorlover 0.3.0 comtypes 1.1.7 conda 4.8.2 conda-build 3.18.11 conda-package-handling 1.6.0 conda-verify 3.4.2 contextlib2 0.6.0.post1 cryptography 2.8 cycler 0.10.0 Cython 0.29.15 cytoolz 0.10.1 dask 2.11.0 decorator 4.4.1 defusedxml 0.6.0 diff-match-patch 20181111 distributed 2.11.0 docutils 0.16 entrypoints 0.3 et-xmlfile 1.0.1 fastcache 1.1.0 filelock 3.0.12 flake8 3.7.9 Flask 1.1.1 fsspec 0.6.2 future 0.18.2 gevent 1.4.0 glob2 0.7 graphviz 0.19 greenlet 0.4.15 h5py 2.10.0 HeapDict 1.0.1 html5lib 1.0.1 hypothesis 5.5.4 idna 2.8 imageio 2.6.1 imagesize 1.2.0 importlib-metadata 1.5.0 intervaltree 3.0.2 ipykernel 5.1.4 ipython 7.12.0 ipython-genutils 0.2.0 ipywidgets 7.5.1 isort 4.3.21 itsdangerous 1.1.0 jdcal 1.4.1 jedi 0.14.1 Jinja2 2.11.1 joblib 0.14.1 json5 0.9.1 jsonschema 3.2.0 jupyter 1.0.0 jupyter-client 5.3.4 jupyter-console 6.1.0 jupyter-core 4.6.1 jupyterlab 1.2.6 jupyterlab-server 1.0.6 keyring 21.1.0 kiwisolver 1.1.0 lazy-object-proxy 1.4.3 libarchive-c 2.8 llvmlite 0.31.0 locket 0.2.0 lxml 4.5.0 MarkupSafe 1.1.1 matplotlib 3.1.3 mccabe 0.6.1 menuinst 1.4.16 mistune 0.8.4 mkl-fft 1.0.15 mkl-random 1.1.0 mkl-service 2.3.0 mlxtend 0.19.0 mock 4.0.1 more-itertools 8.2.0 mpmath 1.1.0 msgpack 0.6.1 multipledispatch 0.6.0 navigator-updater 0.2.1 nbconvert 5.6.1 nbformat 5.0.4 nbmerge 0.0.4 networkx 2.4 nltk 3.4.5 
nose 1.3.7 notebook 6.0.3 numba 0.48.0 numexpr 2.7.1 numpy 1.18.1 numpydoc 0.9.2 olefile 0.46 openpyxl 3.0.3 packaging 20.1 pandas 1.0.1 pandasql 0.7.3 pandocfilters 1.4.2 paramiko 2.7.1 parso 0.5.2 partd 1.1.0 path 13.1.0 pathlib2 2.3.5 pathtools 0.1.2 patsy 0.5.1 pep8 1.7.1 pexpect 4.8.0 pickleshare 0.7.5 Pillow 7.0.0 pip 20.0.2 pkginfo 1.5.0.1 plotly 5.5.0 pluggy 0.13.1 ply 3.11 prometheus-client 0.7.1 prompt-toolkit 3.0.3 psutil 5.6.7 py 1.8.1 pycodestyle 2.5.0 pycosat 0.6.3 pycparser 2.19 pycrypto 2.6.1 pycurl 7.43.0.5 pydocstyle 4.0.1 pyflakes 2.1.1 Pygments 2.5.2 pylint 2.4.4 PyNaCl 1.3.0 pyodbc 4.0.0-unsupported pyOpenSSL 19.1.0 pyparsing 2.4.6 pyreadline 2.1 pyrsistent 0.15.7 PySocks 1.7.1 pytest 5.3.5 pytest-arraydiff 0.3 pytest-astropy 0.8.0 pytest-astropy-header 0.1.2 pytest-doctestplus 0.5.0 pytest-openfiles 0.4.0 pytest-remotedata 0.3.2 python-dateutil 2.8.1 python-jsonrpc-server 0.3.4 python-language-server 0.31.7 pytz 2019.3 PyWavelets 1.1.1 pywin32 227 pywin32-ctypes 0.2.0 pywinpty 0.5.7 PyYAML 5.3 pyzmq 18.1.1 QDarkStyle 2.8 QtAwesome 0.6.1 qtconsole 4.6.0 QtPy 1.9.0 requests 2.22.0 retrying 1.3.3 rope 0.16.0 Rtree 0.9.3 ruamel-yaml 0.15.87 scikit-image 0.16.2 scikit-learn 0.22.1 scipy 1.4.1 seaborn 0.10.0 Send2Trash 1.5.0 setuptools 45.2.0.post20200210 simplegeneric 0.8.1 singledispatch 3.4.0.3 six 1.14.0 snowballstemmer 2.0.0 sortedcollections 1.1.2 sortedcontainers 2.1.0 soupsieve 1.9.5 Sphinx 2.4.0 sphinxcontrib-applehelp 1.0.1 sphinxcontrib-devhelp 1.0.1 sphinxcontrib-htmlhelp 1.0.2 sphinxcontrib-jsmath 1.0.1 sphinxcontrib-qthelp 1.0.2 sphinxcontrib-serializinghtml 1.1.3 sphinxcontrib-websupport 1.2.0 spyder 4.0.1 spyder-kernels 1.8.1 SQLAlchemy 1.3.13 statsmodels 0.11.0 sympy 1.5.1 tables 3.6.1 tblib 1.6.0 tenacity 8.0.1 terminado 0.8.3 testpath 0.4.4 toolz 0.10.0 tornado 6.0.3 tqdm 4.42.1 traitlets 4.3.3 ujson 1.35 unicodecsv 0.14.1 urllib3 1.25.8 watchdog 0.10.2 wcwidth 0.1.8 webencodings 0.5.1 Werkzeug 1.0.0 wheel 0.34.2 
widgetsnbextension 3.5.1 win-inet-pton 1.1.0 win-unicode-console 0.5 wincertstore 0.2 wrapt 1.11.2 xlrd 1.2.0 XlsxWriter 1.2.7 xlwings 0.17.1 xlwt 1.3.0 xmltodict 0.12.0 yapf 0.28.0 zict 1.0.0 zipp 2.2.0 Note: you may need to restart the kernel to use updated packages.
# Load the data set: a tab-separated text file without a header row
testSet = pd.read_csv('testSet - 降维.txt', sep='\t', header=None)
testSet
| | 0 | 1 |
---|---|---|
0 | -1 | -2 |
1 | -1 | 0 |
2 | 0 | 0 |
3 | 2 | 1 |
4 | 0 | 1 |
# Inspect the distribution of the raw data (first column vs. second column)
plt.scatter(
    x=testSet.iloc[:, 0],
    y=testSet.iloc[:, 1],
    marker='.',
    c='black',
);  # the trailing semicolon keeps the notebook from echoing the returned object
# dataSet is a second name for the same DataFrame (no copy is made)
dataSet = testSet
# Compute the mean of every column (axis=0 reduces over the rows)
meanVals = dataSet.mean(axis=0)
meanVals
0 0.0 1 0.0 dtype: float64
# Preview the first five rows of the data before centering
dataSet.head(n=5)
| | 0 | 1 |
---|---|---|
0 | -1 | -2 |
1 | -1 | 0 |
2 | 0 | 0 |
3 | 2 | 1 |
4 | 0 | 1 |
# Center the data so that every column has zero mean:
# subtract the column mean from each value in that column.
meanRemoved = dataSet.sub(meanVals, axis='columns')
meanRemoved.head()
| | 0 | 1 |
---|---|---|
0 | -1.0 | -2.0 |
1 | -1.0 | 0.0 |
2 | 0.0 | 0.0 |
3 | 2.0 | 1.0 |
4 | 0.0 | 1.0 |
# Sanity check: compute the column means of the centered data
meanRemoved.mean(axis=0)
0 0.0 1 0.0 dtype: float64
Ps:如果这里的结果形如 x.xxxe-xxx(科学计数法,例如 1.1e-16),说明它是一个极小的数值,可以视为 0。由于浮点数的精度有限,计算中的舍入误差会使结果不恰好等于 0,把它当作 0 即可。(可参考《计算机组成原理》中关于浮点数表示的内容)
Ps:这里numpy的协方差计算函数np.cov有着很多参数,具体可以参考:https://blog.csdn.net/jeffery0207/article/details/83032325
rowvar:默认为True,此时每一行代表一个变量(属性),每一列代表一个观测;为False时,则反之
bias:默认为False,此时标准化时除以n-1;反之为n。其中n为观测数
为什么分母是 n-1?——因为除以 n-1 得到的是总体方差(协方差)的无偏估计,详细推导可参考统计学教材。
要注意的是,为了与文章保持一致,本代码在计算协方差时没有使用无偏估计(即设置 bias=True,分母为 n)!
# Compute the covariance matrix with numpy's cov function.
# rowvar=False: each column is a variable and each row is an observation.
# bias=True: normalize by n rather than n - 1.
# np.asmatrix is used instead of np.mat, which was removed in NumPy 2.0;
# covMat is still an np.matrix, exactly as before.
covMat = np.asmatrix(np.cov(meanRemoved, rowvar=False, bias=True))
covMat
matrix([[1.2, 0.8], [0.8, 1.2]])
# Compute the eigenvalues and right eigenvectors of the covariance matrix
eigVals,eigVects = np.linalg.eig(covMat)
# Show the eigenvalues
eigVals
array([2. , 0.4])
# Eigenvector matrix P: per the numpy.linalg.eig documentation,
# column i is the eigenvector that belongs to eigVals[i]
eigVects
matrix([[ 0.70710678, -0.70710678], [ 0.70710678, 0.70710678]])
# Number of principal components (largest eigenvalues) to keep
N = 1
# argsort orders the indices by ascending eigenvalue; reverse that order and
# keep the first N entries to get the indices of the N largest eigenvalues
eigValInd = np.argsort(eigVals)[::-1][:N]
eigValInd
array([0], dtype=int64)
# Select the eigenvector columns whose indices are listed in eigValInd
# (column 0 here, given the eigValInd value shown in the output)
redEigVects = eigVects[:,eigValInd]
redEigVects
matrix([[0.70710678], [0.70710678]])
# Project the centered data onto the selected eigenvectors
# (centered data matrix * reduction matrix) to map it into the new space.
# meanRemoved must be converted to an np.matrix so that * means matrix
# multiplication; np.asmatrix is used instead of np.mat, which was removed
# in NumPy 2.0.
lowDDataMat = np.asmatrix(meanRemoved) * redEigVects
lowDDataMat
matrix([[-2.12132034], [-0.70710678], [ 0. ], [ 2.12132034], [ 0.70710678]])