# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1BRvg_ZubjFpciAhFA_Qaj1EZkaIWOITs'
downloaded = drive.CreateFile({'id': file_id})
# print('Downloaded content "{}"'.format(downloaded.GetContentString()))
downloaded.GetContentFile('Online Retail.csv')

 import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from datetime import timedelta
# import warnings
# warnings.simplefilter(action='ignore', category='SettingWithCopyWarning')
sns.set()

 from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
from scipy.spatial.distance import cdist
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

 def check_values(df):
    col_desc = []
    data = {
        'features': [col for col in df.columns],
        'data_type': [df[col].dtype for col in df.columns],
        'nan_total': [df[col].isna().sum() for col in df.columns],
        'nan_pct': [round(df[col].isna().sum()/len(df)*100,2) for col in df.columns],
        'unique': [df[col].nunique() for col in df.columns],
        'values_ex': [df[col].drop_duplicates().sample(df[col].nunique()).values if df[col].nunique() <= 5 else df[col].drop_duplicates().sample(2).values for col in df.columns]
    }
    return pd.DataFrame(data)

 %%time
# filepath = r'C:\Users\azuka\Drive Folder\ML projects\Leapsapp\customer segmentation kmeans\Online Retail.csv'
data = pd.read_csv('Online Retail.csv')
data

CPU times: user 607 ms, sys: 28 ms, total: 635 ms
Wall time: 649 ms

 %%time
check_values(data)

CPU times: user 766 ms, sys: 9.17 ms, total: 776 ms
Wall time: 788 ms

 data.describe()

 data[data.InvoiceNo.str.startswith('C')]

 %%time
df_clean = data.dropna(subset=['CustomerID'])
cancelled = df_clean[df_clean.InvoiceNo.str.startswith('C')].index
df_clean = df_clean.drop(index=cancelled)
df_clean.loc[:,'Date'] = pd.to_datetime(df_clean['InvoiceDate'])
df_clean.loc[:,'TotalSum'] = df_clean['Quantity'] * df_clean['UnitPrice']
df_clean[['Quantity', 'UnitPrice', 'TotalSum']].head()

CPU times: user 2.85 s, sys: 29.3 ms, total: 2.88 s
Wall time: 2.92 s

 print('df_clean length:',len(df_clean))

df_clean length: 397924

 %%time
check_values(df_clean)

CPU times: user 599 ms, sys: 7.9 ms, total: 607 ms
Wall time: 609 ms

 record_date = df_clean.Date.max() + timedelta(days=1)

# Calculate Recency, Frequency and Monetary value for each customer
df_rfm = df_clean.groupby(['CustomerID']).agg({
    'Date': lambda x: (record_date - x.max()).days,
    'InvoiceNo': 'count',
    'TotalSum': 'sum'})

df_rfm.rename(columns={
    'Date':'RecentTrans',
    'InvoiceNo':'Frequency',
    'TotalSum':'MonetaryValue'}, inplace=True)

df_rfm

 %%time
fig, ax = plt.subplots(1,3, figsize=(12,4))
for i, col in enumerate(df_rfm.columns):
    sns.histplot(df_rfm[col], ax=ax[i])
plt.show()

CPU times: user 5.73 s, sys: 61 ms, total: 5.79 s
Wall time: 5.81 s

 skew_columns = (df_rfm.skew().sort_values(ascending=False))

skew_columns = skew_columns.loc[skew_columns > 0.75]
skew_columns

MonetaryValue    19.326985
Frequency        18.106243
RecentTrans       1.917516
dtype: float64

 %%time
df_transf = df_rfm.copy()
for col in skew_columns.index.tolist():
    df_transf[col] = np.log1p(df_transf[col])


fig, ax = plt.subplots(1,3, figsize=(12,4))
for i, col in enumerate(df_transf.columns):
    sns.histplot(df_transf[col], ax=ax[i])
plt.suptitle('After Transformation')
plt.show()

CPU times: user 1.06 s, sys: 13.9 ms, total: 1.07 s
Wall time: 1.07 s

 (df_transf.skew().sort_values(ascending=False))

MonetaryValue    0.360562
Frequency       -0.015272
RecentTrans     -0.459898
dtype: float64

 scaler = StandardScaler()
df_train = df_transf.copy()
for col in df_transf.columns:
    df_train[col] = scaler.fit_transform(df_train[[col]])
df_train.describe().round(4)

 ### BEGIN SOLUTION
# Create and fit a range of models
km_list = []

for clust in range(2,12):
    km = KMeans(n_clusters=clust, random_state=26)
    km = km.fit(df_train)
    
    km_list.append(pd.Series({'clusters': clust, 
                              'inertia': km.inertia_,
                              'model': km}))

plot_data = (pd.concat(km_list, axis=1)
             .T
             [['clusters','inertia']]
             .set_index('clusters'))

ax = plot_data.plot(marker='o',ls='-')
ax.set_xticks(range(0,12,2))
ax.set_xlim(0,12)
ax.set_title('Clusters Number vs Inertia')
ax.set(xlabel='Cluster', ylabel='Inertia');
### END SOLUTION

 ### BEGIN SOLUTION
# Create and fit a range of models
km_list = []

for clust in range(2,12):
    km = KMeans(n_clusters=clust, random_state=26)
    km = km.fit(df_train)
    
    km_list.append(pd.Series({'clusters': clust, 
                              'distortion': sum(np.min(cdist(df_train, km.cluster_centers_,
                                        'euclidean'), axis=1)) / df_train.shape[0],
                              'model': km}))

plot_data = (pd.concat(km_list, axis=1)
             .T
             [['clusters','distortion']]
             .set_index('clusters'))

ax = plot_data.plot(marker='o',ls='-')
ax.set_xticks(range(0,12,2))
ax.set_xlim(0,12)
ax.set_title('Clusters Number vs Distortion')
ax.set(xlabel='Cluster', ylabel='Distortion');
### END SOLUTION

 def plot_silhouette_analysis(X):
    for n_clusters in range(2,12):
        # Create a subplot with 1 row and 2 columns
        fig, ax = plt.subplots(1, 1)
        fig.set_size_inches(8, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax.set_title("The silhouette plot for the various clusters.")
        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax.set_yticks([])  # Clear the yaxis labels / ticks
        ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                    fontsize=14, fontweight='bold')

 plot_silhouette_analysis(df_train)
plt.show()

For n_clusters = 2 The average silhouette_score is : 0.3984490807520717
For n_clusters = 3 The average silhouette_score is : 0.2939590856452589
For n_clusters = 4 The average silhouette_score is : 0.30291276600135514
For n_clusters = 5 The average silhouette_score is : 0.27245225104697063
For n_clusters = 6 The average silhouette_score is : 0.2737323249390449
For n_clusters = 7 The average silhouette_score is : 0.25892865078041044
For n_clusters = 8 The average silhouette_score is : 0.26337085947019967
For n_clusters = 9 The average silhouette_score is : 0.2546326149342129
For n_clusters = 10 The average silhouette_score is : 0.26142348677931965
For n_clusters = 11 The average silhouette_score is : 0.2618187678581975

 def kmeans(df, clusters_number):
    '''
    Implement k-means clustering on dataset
    
    INPUT:
        dataset : dataframe. Dataset for k-means to fit.
        clusters_number : int. Number of clusters to form.
        end : int. Ending range of kmeans to test.
    OUTPUT:
        Cluster results and t-SNE visualisation of clusters.
    '''
    
    kmeans = KMeans(n_clusters = clusters_number, random_state = 1)
    kmeans.fit(df)

    # Extract cluster labels
    cluster_labels = kmeans.labels_
        
    # Create a cluster label column in original dataset
    df_new = df.assign(Cluster = cluster_labels)
    
    # Initialise TSNE
    model = TSNE(random_state=1)
    transformed = model.fit_transform(df)
    
    # Plot t-SNE
    plt.title('Flattened Graph of {} Clusters'.format(clusters_number))
    sns.scatterplot(x=transformed[:,0], y=transformed[:,1], hue=cluster_labels, style=cluster_labels, palette="Set1")
    
    return df_new, cluster_labels

 def plot_ag(df, clusters_number, df_agg):
       
    # kmeans = KMeans(n_clusters = clusters_number, random_state = 1)
    # kmeans.fit(df)

    # # Extract cluster labels
    # cluster_labels = kmeans.labels_
        
    # # Create a cluster label column in original dataset
    # df_new = df.assign(Cluster = cluster_labels)
    
    # Initialise TSNE
    cluster_labels = df_agg['Cluster']

    model = TSNE(random_state=1)
    transformed = model.fit_transform(df)
    
    # Plot t-SNE
    plt.title('Flattened Graph of {} Clusters'.format(clusters_number))
    sns.scatterplot(x=transformed[:,0], y=transformed[:,1], hue=cluster_labels, style=cluster_labels, palette="Set1")

 %%time
df_km_2, labels_2 = kmeans(df_train, 2)
df_label_2 = df_rfm.assign(Cluster = labels_2)
plt.title('Flattened Graph of 2 Clusters with KMeans')
plt.show()

CPU times: user 1min 3s, sys: 410 ms, total: 1min 3s
Wall time: 32.6 s

 ag = AgglomerativeClustering(n_clusters=2, linkage='ward')
ag = ag.fit(df_train)
cluster_labels = ag.fit_predict(df_train)
df_agg_2 = df_train.assign(Cluster = cluster_labels)

 plot_ag(df_train, clusters_number=2, df_agg=df_agg_2)
plt.title('Flattened Graph of 2 Clusters with AgglomerativeClustering')
plt.show()

 %%time
df_km_3, labels_3 = kmeans(df_train, 3)
df_label_3 = df_rfm.assign(Cluster = labels_3)
plt.title('Flattened Graph of 3 Clusters with KMeans')
plt.show()

CPU times: user 1min 2s, sys: 496 ms, total: 1min 3s
Wall time: 32.2 s

 ag = AgglomerativeClustering(n_clusters=3, linkage='ward')
ag = ag.fit(df_train)
cluster_labels = ag.fit_predict(df_train)
df_agg_3 = df_train.assign(Cluster = cluster_labels)

 plot_ag(df_train, clusters_number=3, df_agg=df_agg_3)
plt.title('Flattened Graph of 3 Clusters with AgglomerativeClustering')
plt.show()

 %%time
df_km_4, labels_4 = kmeans(df_train, 4)
df_label_4 = df_rfm.assign(Cluster = labels_4)
plt.title('Flattened Graph of 4 Clusters with KMeans')
plt.show()

CPU times: user 1min 3s, sys: 412 ms, total: 1min 3s
Wall time: 32.6 s

 ag = AgglomerativeClustering(n_clusters=4, linkage='ward')
ag = ag.fit(df_train)
cluster_labels = ag.fit_predict(df_train)
df_agg_4 = df_train.assign(Cluster = cluster_labels)

 %%time
plot_ag(df_train, clusters_number=4, df_agg=df_agg_4)
plt.title('Flattened Graph of 4 Clusters with AgglomerativeClustering')
plt.show()

CPU times: user 1min 2s, sys: 273 ms, total: 1min 2s
Wall time: 32.1 s

 def snake_plot(normalised_df_rfm, df_rfm_kmeans, df_rfm_original=df_rfm):
    '''
    Transform dataframe and plot snakeplot
    '''
    # Transform df_normal as df and add cluster column
    normalised_df_rfm = pd.DataFrame(normalised_df_rfm, 
                                       index=df_rfm_original.index, 
                                       columns=df_rfm_original.columns)
    normalised_df_rfm['Cluster'] = df_rfm_kmeans['Cluster']

    # Melt data into long format
    df_melt = pd.melt(normalised_df_rfm.reset_index(), 
                        id_vars=['CustomerID', 'Cluster'],
                        value_vars=['RecentTrans', 'Frequency', 'MonetaryValue'], 
                        var_name='Metric', 
                        value_name='Value')

    plt.xlabel('Metric')
    plt.ylabel('Value')
    
    return sns.pointplot(data=df_melt, x='Metric', y='Value', hue='Cluster')

 %%time
fig, ax = plt.subplots(1,3, figsize=(16,4))
plt.subplot(1,3,1)
ax[0]=snake_plot(df_train, df_km_2)
ax[0].set_title('cluster = 2')
plt.subplot(1,3,2)
ax[1]=snake_plot(df_train, df_km_3)
ax[1].set_title('cluster = 3')
plt.subplot(1,3,3)
ax[2]=snake_plot(df_train, df_km_4)
ax[2].set_title('cluster = 4')
plt.suptitle('KMeans')
plt.show()

CPU times: user 1.91 s, sys: 17.4 ms, total: 1.92 s
Wall time: 1.93 s

 def rfm_values(df):
    '''
    Calcualte average RFM values and size for each cluster

    '''
    df_new = df.groupby(['Cluster']).agg({
        'RecentTrans': 'mean',
        'Frequency': 'mean',
        'MonetaryValue': ['mean', 'count']
    }).round(0)
    
    return df_new

 rfm_values(df_label_3)

	features	data_type	nan_total	nan_pct	unique	values_ex
0	InvoiceNo	object	0	0.00	25900	[553565, 575490]
1	StockCode	object	0	0.00	4070	[72132, 84801B]
2	Description	object	1454	0.27	4223	[OFFICE MUG WARMER PINK, crushed boxes]
3	Quantity	int64	0	0.00	722	[768, 99]
4	InvoiceDate	object	0	0.00	23260	[29/09/2011 9:53, 04/02/2011 14:04]
5	UnitPrice	float64	0	0.00	1630	[448.0, 732.69]
6	CustomerID	float64	135080	24.93	4372	[16094.0, 17928.0]
7	Country	object	0	0.00	38	[Lithuania, Australia]

	Quantity	UnitPrice	CustomerID
count	541909.000000	541909.000000	406829.000000
mean	9.552250	4.611114	15287.690570
std	218.081158	96.759853	1713.600303
min	-80995.000000	-11062.060000	12346.000000
25%	1.000000	1.250000	13953.000000
50%	3.000000	2.080000	15152.000000
75%	10.000000	4.130000	16791.000000
max	80995.000000	38970.000000	18287.000000

	InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
141	C536379	D	Discount	-1	01/12/2010 9:41	27.50	14527.0	United Kingdom
154	C536383	35004C	SET OF 3 COLOURED FLYING DUCKS	-1	01/12/2010 9:49	4.65	15311.0	United Kingdom
235	C536391	22556	PLASTERS IN TIN CIRCUS PARADE	-12	01/12/2010 10:24	1.65	17548.0	United Kingdom
236	C536391	21984	PACK OF 12 PINK PAISLEY TISSUES	-24	01/12/2010 10:24	0.29	17548.0	United Kingdom
237	C536391	21983	PACK OF 12 BLUE PAISLEY TISSUES	-24	01/12/2010 10:24	0.29	17548.0	United Kingdom
...	...	...	...	...	...	...	...	...
540449	C581490	23144	ZINC T-LIGHT HOLDER STARS SMALL	-11	09/12/2011 9:57	0.83	14397.0	United Kingdom
541541	C581499	M	Manual	-1	09/12/2011 10:28	224.69	15498.0	United Kingdom
541715	C581568	21258	VICTORIAN SEWING BOX LARGE	-5	09/12/2011 11:57	10.95	15311.0	United Kingdom
541716	C581569	84978	HANGING HEART JAR T-LIGHT HOLDER	-1	09/12/2011 11:58	1.25	17315.0	United Kingdom
541717	C581569	20979	36 PENCILS TUBE RED RETROSPOT	-5	09/12/2011 11:58	1.25	17315.0	United Kingdom

	RecentTrans	Frequency	MonetaryValue
CustomerID
12346.0	327	1	77183.60
12347.0	41	182	4310.00
12348.0	77	31	1797.24
12349.0	20	73	1757.55
12350.0	312	17	334.40
...	...	...	...
18280.0	161	10	180.60
18281.0	5	7	80.82
18282.0	217	12	178.05
18283.0	11	756	2094.88
18287.0	1	70	1837.28

	RecentTrans	Frequency	MonetaryValue
count	4339.0000	4339.0000	4339.0000
mean	0.0000	-0.0000	0.0000
std	1.0001	1.0001	1.0001
min	-2.6679	-2.4328	-5.2266
25%	-0.6904	-0.6764	-0.6828
50%	0.0776	0.0009	-0.0608
75%	0.8365	0.7023	0.6534
max	1.9926	4.1818	4.7184

Online Retail Customer Segmentation¶

1. Dataset Summary¶

2. Goals¶

3. Data Cleaning and Feature Engineering¶

Data Cleaning¶

Feature Engineering¶

4. EDA¶

Log Transformation¶

5. Modelling¶

Search for optimal number of clusters¶

Clusters number vs Inertia¶

Clusters number vs Distortion¶

Clusters number vs Silhouette Score¶

KMeans vs Hierarchical Agglomerative Clustering¶

Clusters Number = 2, (KMeans (top) vs AgglomerativeClustering (bottom))¶

Clusters Number = 3, (KMeans (top) vs AgglomerativeClustering (bottom))¶

Clusters Number = 4, (KMeans (top) vs AgglomerativeClustering (bottom))¶

6. Customer Segmentation Result¶

7. Conclusion and Suggestion¶

	features	data_type	unique	values_ex
0	InvoiceNo	object	18536	[566633, 556792]
1	StockCode	object	3665	[21191, 16238]
2	Description	object	3877	[PENCIL CASE LIFE IS BEAUTIFUL, DOORSTOP FOOTB...
3	Quantity	int64	302	[608, 407]
4	InvoiceDate	object	17286	[07/07/2011 10:28, 04/12/2011 12:18]
5	UnitPrice	float64	441	[4.98, 1.93]
6	CustomerID	float64	4339	[13607.0, 13292.0]
7	Country	object	37	[Saudi Arabia, United Arab Emirates]
8	Date	datetime64[ns]	17286	[2011-10-13T12:58:00.000000000, 2011-11-21T12:...
9	TotalSum	float64	2940	[114.72, 191.73000000000002]

	RecentTrans	Frequency	MonetaryValue
	mean	mean	mean	count
Cluster
0	194.0	16.0	303.0	1552
1	21.0	258.0	6419.0	1005
2	78.0	64.0	1117.0	1782