The purpose of this program is to:
1. read a dataset: the rows are customers, the columns are the products each customer buys
2. apply Principal Component Analysis (PCA) to reduce the number of features
3. apply k-means to determine the cluster each customer belongs to
4. repeat steps 1 and 2 on a new dataset that has the same structure as the original one but different values
5. apply the k-means model determined at step 3 to the new dataset

The issue is that repeated runs give different results in terms of which cluster a customer belongs to. There must be a bug that I am unable to find. Thanks in advance.
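For clarity, here is a minimal, self-contained sketch of the flow I am trying to implement (the random arrays are placeholders standing in for the scaled data; the full code with the real data follows below):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
scaled_original = rng.normal(size=(6, 7))   # placeholder for the scaled original data
scaled_new = rng.normal(size=(6, 7))        # placeholder for the scaled new data

pca = PCA(n_components=3)
W = pca.fit_transform(scaled_original)                   # step 2: reduce the original data
kmeans = KMeans(n_clusters=4).fit(W)                     # step 3: cluster the original data
labels_new = kmeans.predict(pca.transform(scaled_new))   # step 5: reuse the step-3 model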
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
def get_kmeans_score(data, center):
    '''
    returns the k-means SSE score for points to centers
    INPUT:
        data - the dataset you want to fit kmeans to
        center - the number of centers you want (the k value)
    OUTPUT:
        score - the SSE score for the kmeans model fit to the data
    '''
    # instantiate kmeans
    kmeans = KMeans(n_clusters=center)
    # then fit the model to the data using the fit method
    model = kmeans.fit(data)
    # obtain a score related to the model fit
    score = np.abs(model.score(data))
    return score
data = {
    'apples': [3, 2, 0, 9, 2, 1],
    'oranges': [0, 7.6, 7, 2, 7, 6],
    'figs': [1.4, 11, 10.999, 3.99, 10, 2],
    'pears': [5, 2, 6, 2.45, 1, 7],
    'berries': [1.3, 4, 10, 0, 5, 21],
    'tomatoes': [5, 15, 3, 4, 17, 5],
    'onions': [11, 3, 3, 1, 0, 10]
}
purchases = pd.DataFrame(data, index=['June', 'Robert', 'Lily', 'David', 'Bob', 'Karen'])
print('ORIGINAL DATA')
print(purchases)
Y1 = pd.DataFrame(np.round(purchases, 0), columns=purchases.keys())
scaler = StandardScaler()
Y = scaler.fit_transform(Y1)
pca = PCA(n_components=3)
W = pca.fit_transform(Y)
# apply k-means
scores = []
centers = list(range(1,5))
for center in centers:
    scores.append(get_kmeans_score(W, center))
X = zip(centers, scores)
print('k-means results on original data as a function of # centers')
for i in X:
    print(i)
# from the above results, assume the elbow is 4 clusters
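# optionally, plot SSE vs. the number of clusters to judge the elbow visually
# (uses the matplotlib import above; assumes a display is available for plt.show())
plt.plot(centers, scores, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('SSE (absolute k-means score)')
plt.title('elbow curve on the PCA-reduced original data')
plt.show()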
print('_________________________________________')
n_c = 4
#kmeans = KMeans(n_clusters=4, random_state=int)
kmeans = KMeans(n_clusters=4)
model = kmeans.fit(W)
score = np.abs(model.score(W))
print('k-means score on ', n_c, ' clusters for the original dataset = ',score)
# model is the k-means model that will also be applied to the new dataset
#
NEW_data = {
    'apples': [9, 20, 10, 2, 12, 1],
    'oranges': [10, 3, 12, 1, 18, 5],
    'figs': [34, 11, 3.999, 1, 0, 12],
    'pears': [5, 2, 16, 2.45, 10, 11],
    'berries': [13, 4, 1, 2, 15, 4],
    'tomatoes': [7, 2, 1, 14, 27, 2],
    'onions': [1, 10, 11, 2, 4, 10]
}
purchases_N = pd.DataFrame(NEW_data, index=['June', 'Robert', 'Lily', 'David', 'Bob', 'Karen'])
print('NEW DATA')
print(purchases_N)
YY1 = pd.DataFrame(np.round(purchases_N, 0), columns=purchases_N.keys())
YY = scaler.fit_transform(YY1)
W1 = pca.transform(YY)
# score the new dataset using the k-means model determined on the original dataset
scoreNew = np.abs(model.score(W1))
print('k-means score on ', n_c, ' clusters for the new dataset = ', scoreNew)
# predictions for the two datasets using the k-means model based on the original data
predict_purchases_dataset = model.predict(W)
predict_purchases_NewDataset = model.predict(W1)
print('original data upon PCA using n_components=3')
print(W)
print('k-means predictions --- original data')
print(predict_purchases_dataset)
print('_________________________________________')
print('new data upon PCA using n_components=3')
print(W1)
print('k-means predictions --- new data')
print(predict_purchases_NewDataset)
# the output matches the prediction on the original dataset:
# there are 2 customers in cluster 2, 2 in cluster 1, 1 in cluster 3 and 1 in cluster 0
L = len(purchases.index)
x = [i for i in range(10)]
orig = []
NEW = []
for i in range(10):
    orig.append((predict_purchases_dataset == i).sum()/L)
    NEW.append((predict_purchases_NewDataset == i).sum()/L)
print('proportion of k-means clusters for original data')
print(orig)
print('proportion of k-means clusters for new data')
print(NEW)
#df_summary = pd.DataFrame({'cluster' : x, 'propotion_orig' : orig, 'proportion_NEW': NEW})
#df_summary.plot(x='cluster', y= ['propotion_orig','proportion_NEW' ], kind='bar')
print('k-means cluster centers in the PCA space')
print(model.cluster_centers_)
#
IPCA = pca.inverse_transform(model.cluster_centers_)
APPROX = scaler.inverse_transform(IPCA)
approx_df = pd.DataFrame(APPROX, columns=purchases.columns)
print('k-means centers coordinates in original features space')
print(approx_df)
Output of the cluster predictions from two consecutive runs:

FIRST RUN
k-means predictions --- original data
[3 1 0 2 1 0]
k-means predictions --- new data
[1 2 0 1 1 0]
SECOND RUN
k-means predictions --- original data
[1 2 0 3 2 0]
k-means predictions --- new data
[2 3 0 2 2 0]