# import the library for data manipulation  
import pandas as pd

# Load the body-measurement workbook into a DataFrame.
# Edit data_path so it points at your own copy of the file, e.g.
#   macOS:   r'/Users/cyu/Documents/body_measurements.xlsx'
#   Windows: the path used below
data_path = r'G:\JUMP\553\Lecture_PowerPoint\Unit_09_PCA\Python_files\body_measurements.xlsx'
bm = pd.read_excel(data_path)

# Echo the DataFrame so the import can be checked by eye
print(bm)

# import the library for PCA
from sklearn.decomposition import PCA

# Fit an unrestricted PCA (no n_components given) on the raw data so the
# variance explained by each component can be inspected afterwards.
# NOTE(review): the columns are passed in unscaled; if they are measured in
# different units, standardizing first may be intended -- confirm.
pca = PCA()
pca.fit(bm)

# import the libraries for numeric manipulation and data visualization
import numpy as np
import matplotlib.pyplot as plt

# Plot the cumulative share of variance explained against the number of
# components retained.
# explained_variance_ratio_ holds fractions, so scale by 100 to match the
# "(%)" axis label.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
# Component counts start at 1; letting plot() default to the 0-based index
# would shift the whole curve one component to the left.
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance_pct)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')  # cumulative, up to and including each component
plt.title('Variance explained')
plt.show()


# Refit, this time keeping only the first four principal components.
# This rebinds `pca`, replacing the unrestricted model fitted earlier.
pca = PCA(n_components=4).fit(bm)

# Create a heatmap of the fitted components to see how each item weighs on
# each component (rows = components, columns = input variables).
import seaborn as sns
heatmap = pd.DataFrame(pca.components_)
# Display names for the columns.
# NOTE(review): assumed to be in the same order as the columns of `bm` -- confirm.
x_axis_labels = ["Mass","Fore","Bicep","Chest","Neck","Shoulder","Waist","Height","Calf","Thigh","Head"]
# Start a fresh figure: sns.heatmap draws on the current axes, which under an
# interactive backend could still be the cumulative-variance plot.
plt.figure()
sns.heatmap(heatmap, cmap='twilight', annot=True, xticklabels=x_axis_labels)
# Without an explicit show() the heatmap never appears when run as a script.
plt.show()