import numpy as np
def zeroMean(dataMat):
    """Center each column of ``dataMat`` on zero.

    Args:
        dataMat: 2-D array-like; the mean is taken along axis 0, so each
            column is centered independently.

    Returns:
        A tuple ``(newData, meanVal)``: the centered data and the
        per-column means that were subtracted.
    """
    meanVal = np.mean(dataMat, axis=0)
    newData = dataMat - meanVal
    return newData, meanVal


def percentage2n(eigVals, percentage):
    """Pick how many principal components to keep.

    Eigenvalues are accumulated from largest to smallest until the running
    total reaches ``percentage`` of their overall sum.

    Args:
        eigVals: 1-D array-like of eigenvalues.
        percentage: Fraction of the eigenvalue sum to retain.

    Returns:
        The number of leading eigenvalues needed. If the threshold is never
        reached (for example ``percentage`` greater than 1), every
        eigenvalue is counted instead of returning ``None``.
    """
    descending = np.sort(np.asarray(eigVals))[::-1]
    threshold = sum(descending) * percentage
    running = 0
    for count, value in enumerate(descending, start=1):
        running += value
        if running >= threshold:
            return count
    # Threshold not reached: keep all components rather than fall through
    # with an implicit None that would break the caller's slicing.
    return len(descending)


def pca(dataMat, percentage=0.99):
    """Project ``dataMat`` onto its leading principal components.

    Args:
        dataMat: 2-D array-like, one sample per row and one feature per
            column (the covariance is computed with ``rowvar=False``).
        percentage: Fraction of the eigenvalue sum to retain; forwarded to
            ``percentage2n``.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` of ``np.matrix`` objects: the
        data expressed in the reduced space, and the data reconstructed
        from that reduced representation with the mean added back.
    """
    # Step 1: zero-center the data.
    centered, mean_val = zeroMean(dataMat)
    centered = np.asarray(centered, dtype=float)

    # Step 2: covariance matrix. np.cov squeezes a single-feature result
    # down to 0-d, so force it back to 2-D for the eigen-decomposition.
    cov = np.atleast_2d(np.cov(centered, rowvar=False))

    # Step 3: eigen-decomposition. The covariance matrix is symmetric, so
    # eigh applies and yields real eigenvalues and orthonormal vectors.
    eig_vals, eig_vects = np.linalg.eigh(cov)

    n = percentage2n(eig_vals, percentage)

    # Indices of the n largest eigenvalues, largest first.
    top_indices = np.argsort(eig_vals)[::-1][:n]
    components = eig_vects[:, top_indices]

    low_dim = centered @ components  # data in the low-dimensional space
    recon = low_dim @ components.T + np.asarray(mean_val)  # reconstruction

    # Results are wrapped as np.matrix so existing callers that depend on
    # matrix semantics (e.g. ``*`` as matrix product) keep working.
    return np.asmatrix(low_dim), np.asmatrix(recon)


def main():
    """Run PCA on a small 2-D sample and print the reduced data."""
    data = [
        [10.2352, 11.322],
        [10.1223, 11.811],
        [9.1902, 8.9049],
        [9.3064, 9.8474],
        [8.3301, 8.3404],
        [10.1528, 10.1235],
        [10.4085, 10.822],
        [9.0036, 10.0392],
        [9.5349, 10.097],
        [9.4982, 10.8254],
    ]
    lowDDataMat, reconMat = pca(data, 0.9)
    print(lowDDataMat)


if __name__ == "__main__":
    main()