IntroML/PCA/pca.py at master · b2net/IntroML · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# tested on Python 3.6.3
# work dir must contain: close_prices.csv, djia_index.csv
# performs PCA and gets insights about components

import pandas     # http://pandas.pydata.org/
import numpy      # http://www.numpy.org/
#import os         # https://docs.python.org/3/library/os.html

from sklearn.decomposition import PCA # http://scikit-learn.org/stable/

# set cd
#os.chdir('D:\Programming\Python\IntroML\PCA')

# read both input tables from the working directory, then slice out the
# columns the analysis uses
closePrices = pandas.read_csv('close_prices.csv')
djiaIndex = pandas.read_csv('djia_index.csv')

# columns 1..31 of the prices table (column 0 is skipped -- presumably a
# date/label column; confirm against the csv)
data = closePrices.iloc[:, 1:32]
# second column of the index table
dj = djiaIndex.iloc[:, 1]

# build a 10-component PCA model and fit it on the price columns
# (fit() returns the estimator itself, so the calls can be chained)
pca = PCA(n_components=10).fit(data)

# report the column that carries the largest weight in the 1st component
# NOTE(review): argmax picks the largest *signed* weight; if "impact" is
# meant as magnitude, the absolute weights would be needed -- confirm
firstComponent = pca.components_[0]
maxWeightInd = firstComponent.argmax()
topCompany = data.columns[maxWeightInd]
print('Company with the highest impact to the 1st component is', topCompany)

# find the smallest number of leading components whose cumulative
# explained-variance ratio reaches 90%
totRat = 0
for nComp, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    totRat += ratio
    if totRat >= 0.90:
        print('Sufficient number of components for 90% e.v. =', nComp)
        break
else:
    # every fitted component was consumed without reaching the threshold;
    # say so explicitly instead of silently printing nothing
    print('90% e.v. is not reached with',
          len(pca.explained_variance_ratio_), 'fitted components')

# project the price data onto the fitted components
result = pca.transform(data)

# Pearson correlation between the 1st component's values and the DJ index:
# corrcoef returns a 2x2 matrix, the off-diagonal entry is the coefficient
comp1 = result[:, 0]
corrMatrix = numpy.corrcoef(comp1, dj)
pearson = corrMatrix[0, 1]
print('Pearson corr between the 1st component and DJ index =',
      round(pearson, 2))