# Import the Diabetes data set. Change the path below to the location of your data file.
import pandas as pd
# Read the Diabetes workbook into a DataFrame.
# The active path is the author's Windows location; on another machine, point
# it at your own copy. The author's Mac equivalent was:
#   df = pd.read_excel (r'/Users/cyu/Documents/Diabetes.xlsx')
df = pd.read_excel(
    io=r'G:\JUMP\553\Lecture_PowerPoint\Unit_08_cluster_analysis\Python_files\Diabetes.xlsx',
)

import numpy as np

# Gather the seven profiling variables into one 2-D array: one row per record
# in df, one column per variable, in the order listed here. Selecting the
# columns and calling to_numpy() lets pandas build the array directly instead
# of zipping the columns row by row in Python.
Profile = df[['Age', 'BMI', 'BP', 'HDL', 'TCH', 'LTG', 'Glucose']].to_numpy()

# import the libraries for k-means clustering, spatial metrics, and plotting
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

# Distortion for each candidate k: the mean Euclidean distance from every row
# of Profile to its nearest cluster centre.
distortions = []
# Candidate numbers of clusters for the elbow search: k = 1 .. 9
K = range(1,10)
for k in K:
    # Fit exactly once per k: fit() returns the fitted estimator, so calling
    # it a second time would only repeat the work and replace the first result.
    kmeanModel = KMeans(n_clusters=k).fit(Profile)
    # Distance of each row to every centre, reduced to the closest centre per row.
    nearest_dist = cdist(Profile, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_dist.mean())

# Draw the elbow curve on the current axes: distortion against k, using the
# 'bx-' format (blue, x markers, solid line), then display the figure.
elbow_axes = plt.gca()
elbow_axes.plot(K, distortions, 'bx-')
elbow_axes.set_xlabel('k')
elbow_axes.set_ylabel('Distortion')
elbow_axes.set_title('The Elbow graph showing the optimal k')
plt.show()