RFM Analysis for Strategic Marketing Segmentation by Barrett Duna¶
Executive Summary¶
An online retailer's transaction data is used to segment its customers (many of whom are wholesalers) into distinct groups with similar purchasing behavior. RFM scoring is applied to engineer features that capture recency of purchase, frequency of purchase and monetary spend. Each segment is labeled with a 3-digit code, and each digit scores the segment on one aspect of purchasing behavior. This analysis provides the information needed to run highly targeted marketing campaigns, with messaging tailored to each segment's particular needs.
Mathematical Details of the Triangular Plot¶
Curious as to why the data in the RFM cluster plot is triangular? Skip ahead to the section "Mathematical Derivations and Proofs Related to the Triangular Plot" for the mathematics that give rise to this phenomenon.
RFM Scoring¶
Below is an explanation of RFM scoring's three components: recency (R), frequency (F) and monetary (M). Note that each RFM component below is scored on a scale of $1$ (worst) to $5$ (best).
Recency To score highly on recency, a customer or wholesaler must have purchased recently. On the other hand, a purchaser with a low recency score hasn't done business with the retailer in a long period of time.
Frequency A high-frequency score indicates that the customer or wholesaler purchases often, with small time intervals between purchases. A low-frequency score is indicative of purchases that occur at wide intervals.
Monetary High monetary scores imply that the purchaser spends a large amount of money over time, producing a high average daily expenditure. A low monetary score translates to a small economic outlay over time, leading to a low average daily spend.
Segment Strategic Analyses¶
Segment 444¶
This group of purchasers is the online retailer's strongest segment. Given its high rating on all three metrics, it is likely comprised mostly of wholesalers. The members of this segment have purchased reasonably recently (within the last 20 days on average), so the segment is still actively doing business with the retailer, and it maintains a healthy frequency relative to the other segments (an average of 51 days between purchases). Most notably, this segment overwhelmingly leads in spending, with an average expenditure of \$12.98 a day (the second highest is \$4.34).
Since, in all likelihood, most of these individuals are wholesalers, one solid strategy would be to implement wholesale pricing optimization based on past transactional data. Online wholesale is likely an incredibly competitive business, so optimizing prices is essential for survival. One thing to note is that margins are likely extremely thin, so price optimization must incorporate costs into the calculation when deriving optimal prices.
Segment 132¶
This segment struggles with recency. On average, the members of this segment purchased 223 days ago. Most of the consumers and wholesalers in this segment have likely churned, and the likelihood that one of these individuals will do business with the online retailer again is small. A "last attempt" email campaign could be sent to these customers and wholesalers, drawing some of them back in, but conversion rates would likely be low, delivering a suboptimal ROI. The online retailer is best off deprioritizing this segment due to likely churn, avoiding wasted efforts and budgetary resources.
Segment 312¶
This segment's members, on average, purchased 42 days ago, ranking the segment 2nd in recency. The strong recency score signals that this segment is actively interacting with the retailer, so it shouldn't be ignored. Its biggest weakness is the average 224-day frequency, the worst among all segments. Solid recency combined with weak frequency likely means this segment is comprised of customers and wholesalers who purchased early on and have recently become active again. Attention should be given to retaining this recent resurgence in activity and encouraging the segment to keep purchasing.
Segment 222¶
This segment needs special attention. All RFM scores are above $1$, meaning the segment is not seriously lacking in any specific area. This segment could become a substantial source of additional sales if properly handled. The online retailer's primary goal should be to build strong relationships with this segment's members. Strengthening relationships with this segment can be accomplished through typical digital marketing means such as email drip campaigns, social advertising, SEO, PPC and content marketing.
Mathematical Derivations and Proofs Related to the Triangular Plot¶
Introduction¶
Let $k \ge 2$ be the total number of transactions for a specific customer. ($k$ cannot equal $1$: with a single purchase the frequency defined below is undefined, so single-purchase customers have been removed from the dataset.)
Let $\theta_0$ be the date of the customer or wholesaler's first transaction.
Let $\theta_1$ be the date of the most recent transaction entered into by the customer or wholesaler.
Let $S$ be the specific customer's total spend over the customer's lifetime.

Let $N$ be the number of days spanned by the dataset, with dates measured as integer day offsets so that day $0$ is the earliest transaction date in the dataset and day $N$ the most recent.
Fundamental Definitions¶
We define $f$ (frequency), $r$ (recency) and $m$ (monetary) as follows:
$f = \frac{\theta_1 - \theta_0}{k-1}$
$r = N - \theta_1$
$m = \frac{S}{N - \theta_0}$
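These definitions translate directly into code. Below is a minimal sketch for a single hypothetical customer, with dates expressed as integer day offsets; the function name `rfm_values` and the example numbers are illustrative, not from the notebook.

```python
def rfm_values(theta_0, theta_1, k, S, N):
    """Compute the raw frequency, recency and monetary values.

    theta_0 -- day of first transaction (integer offset)
    theta_1 -- day of most recent transaction
    k       -- total number of transactions (k >= 2)
    S       -- lifetime spend
    N       -- most recent day in the dataset
    """
    f = (theta_1 - theta_0) / (k - 1)  # average days between purchases
    r = N - theta_1                    # days since last purchase
    m = S / (N - theta_0)              # average daily spend
    return f, r, m

# hypothetical customer: first bought on day 0, last on day 90,
# 10 transactions, $500 lifetime spend; dataset ends on day 100
f, r, m = rfm_values(0, 90, 10, 500.0, 100)
print(f, r, m)  # 10.0 10 5.0
```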
Variable Constraints¶
$f \in [1, N]$ (since $k$ counts distinct purchase dates, $\theta_1 - \theta_0 \ge k-1$, and so $f \ge 1$)
$r \in [0, N-1]$
$ m \in (0, \infty )$
$ \theta_0 \in [0, N-1]$
$ \theta_1 \in [1, N]$
$ k \in [2, N+1]$
Special Cases¶
CASE #1:¶
Let $\theta_1 = N$, $\theta_0 = 0$ and $k=2$.
$f = \frac{\theta_1 - \theta_0}{k-1} = \frac{N - 0}{2-1} = N$
$r = N - \theta_1 = N - N = 0$
So in this case, $f = N$ and $r = 0$.
CASE #2:¶
Assume $\theta_1 = 1$, $\; \theta_0 = 0$ and $k=2$.
$f = \frac{\theta_1 - \theta_0}{k-1} = \frac{1-0}{2-1} = \frac{1}{1} = 1$
$r = N - \theta_1 = N-1$
Given the values, we observe $f = 1$ and $r = N-1$.
CASE #3:¶
Set $\theta_1 = N$, $\theta_0 = 1$ and $k=N$.
$f = \frac{\theta_1 - \theta_0}{k-1} = \frac{N-1}{N-1} = 1$
$r = N - \theta_1 = N-N = 0$
We see $f = 1$ and $r = 0$.
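The three cases can be verified numerically for a concrete dataset length (here the arbitrary choice $N = 10$):

```python
N = 10  # arbitrary dataset length in days

def f_val(theta_0, theta_1, k):
    return (theta_1 - theta_0) / (k - 1)

def r_val(theta_1):
    return N - theta_1

# Case 1: theta_1 = N, theta_0 = 0, k = 2  ->  f = N, r = 0
assert f_val(0, N, 2) == N and r_val(N) == 0
# Case 2: theta_1 = 1, theta_0 = 0, k = 2  ->  f = 1, r = N - 1
assert f_val(0, 1, 2) == 1 and r_val(1) == N - 1
# Case 3: theta_1 = N, theta_0 = 1, k = N  ->  f = 1, r = 0
assert f_val(1, N, N) == 1 and r_val(N) == 0
```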
Derivation of the Boundary Line Equation¶
Given the cases above, we now have enough information to determine the boundary line's slope and intercept. As a reminder, here are two points $\lt f, r \gt$ on the boundary line, obtained from cases #1 and #2 above: $\lt N, 0 \gt$ and $\lt 1, N-1 \gt$.
$slope = \frac{0 - (N-1)}{N-1} = \frac{-(N-1)}{N-1} = -1$
$intercept = y_1 - (slope)x_1 = 0-(-1)N = N$
Thus the equation for the boundary line is $r(f) = N - f$.
Note the distinction between formulas such as $f = \frac{\theta_1 - \theta_0}{k-1}$ and $r = N - \theta_1$, which give the values of $f$ and $r$ for an individual customer, and the equation for the boundary line derived here, which describes the edge of the region those $\lt f, r \gt$ points can occupy.
Proof of Lower Triangular Property¶
Note, our proposed boundary line is $r(f) = N - f$.
We claim $ r \le N - f $ (a statement which implies the triangular property) for any valid point $\lt f, r \gt$. This is equivalent to the claim that $ r + f \le N$.
Note...
$f = \frac{\theta_1 - \theta_0}{k-1}$
$r = N - \theta_1$
And $k-1 \ge 1$ because $k \ge 2$, and thus...
$$ r + f \le r + (k-1)f = (N - \theta_1) + (\theta_1 - \theta_0) = N - \theta_0 \le N$$

Thus, the proof is complete and the lower triangular property has been explained.
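The claim $r + f \le N$ can also be sanity-checked empirically by simulating random valid customers. This is a quick numerical check under the constraints stated above, not a substitute for the proof:

```python
import random

random.seed(0)
N = 365

for _ in range(10_000):
    theta_0 = random.randint(0, N - 1)
    theta_1 = random.randint(theta_0 + 1, N)
    # k distinct purchase dates require theta_1 - theta_0 >= k - 1
    k = random.randint(2, theta_1 - theta_0 + 1)
    f = (theta_1 - theta_0) / (k - 1)
    r = N - theta_1
    assert r + f <= N  # every valid point lies on or below r = N - f
```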
Maximizing $f$¶
By the above, it is sufficient to maximize $f$ on the boundary line.
We know that $r = N - f$.
This implies that $f = N - r$.
Since $r$ must be nonnegative, the value of $f$ when maximized occurs when $r = 0$ which is observed in case #1.
Thus, $max(f) = N$.
Maximizing $r$¶
Since we have shown above that all points lie on or below the boundary line, it suffices to find the max on the boundary line.
We have $r(f) = N - f$.
Due to the fact that $f \ge 1$, $r$ is maximized when $f = 1$ which occurs in case #2.
So $max(r) = N-1$.
Preliminaries¶
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler # data standardization
import matplotlib.pyplot as plt # data viz
from sklearn.cluster import KMeans # KMeans Algo
import seaborn as sns # data viz
# read in data
f_path = 'online_retail.csv'
data = pd.read_csv(f_path)
data.head(3)
|   | InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID |
|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 12/1/2010 8:26 | 2.55 | 17850.0 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 12/1/2010 8:26 | 3.39 | 17850.0 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 12/1/2010 8:26 | 2.75 | 17850.0 |
# drop columns 'StockCode' and 'Description'
data.drop(columns=['StockCode', 'Description'], inplace=True)
# drop rows with missing 'CustomerID' (the raw data contains NaNs,
# which is why the column loads as float), then cast the feature to int
data.dropna(subset=['CustomerID'], inplace=True)
data['CustomerID'] = data['CustomerID'].astype(int)
# convert 'InvoiceDate' to a datetime.date
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate']).dt.date
Purchase Recency Analysis¶
# get the most recent purchase date by customer
most_recent_purchase = data.groupby('CustomerID')
most_recent_purchase = most_recent_purchase.agg({'InvoiceDate': 'max'})
most_recent_purchase.rename(columns={'InvoiceDate': 'MostRecentPurchase'}, inplace=True)
# calculate the most recent date in the dataset
most_recent_date = data['InvoiceDate'].max()
# calculate recency
recency = (most_recent_date - most_recent_purchase)
recency.rename(columns={'MostRecentPurchase': 'Recency'}, inplace=True)
recency['Recency'] = recency['Recency'].apply(lambda x: int(x.days))
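The recency pipeline above can be traced on a tiny hypothetical DataFrame. This sketch uses pandas Timestamps rather than the notebook's `datetime.date` objects, which doesn't change the arithmetic:

```python
import pandas as pd

# toy data (hypothetical): two customers, three invoices
toy = pd.DataFrame({
    'CustomerID': [1, 1, 2],
    'InvoiceDate': pd.to_datetime(['2011-01-01', '2011-03-01', '2011-02-01']),
})

# each customer's most recent purchase date
last = toy.groupby('CustomerID')['InvoiceDate'].max()
# days between the dataset's latest date and each customer's last purchase
recency_days = (toy['InvoiceDate'].max() - last).dt.days
print(recency_days.to_dict())  # {1: 0, 2: 28}
```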
Purchase Monetary Analysis¶
data.head(3)
|   | InvoiceNo | Quantity | InvoiceDate | UnitPrice | CustomerID |
|---|---|---|---|---|---|
| 0 | 536365 | 6 | 2010-12-01 | 2.55 | 17850 |
| 1 | 536365 | 6 | 2010-12-01 | 3.39 | 17850 |
| 2 | 536365 | 8 | 2010-12-01 | 2.75 | 17850 |
# calculate quantity x price and total customer spend
order_value = data[['CustomerID', 'Quantity', 'UnitPrice']].copy()
order_value['OrderValue'] = order_value['Quantity'] * order_value['UnitPrice']
total_customer_spend = order_value.groupby('CustomerID')
total_customer_spend = total_customer_spend.agg({'OrderValue': 'sum'})
total_customer_spend.rename(columns={'OrderValue': 'TotalCustomerSpend'}, inplace=True)
# calculate each customer's first purchase date
first_purchase_date = data.groupby('CustomerID').agg({'InvoiceDate': 'min'})
first_purchase_date.rename(columns={'InvoiceDate': 'FirstPurchaseDate'}, inplace=True)
# calculate most recent transaction date in the dataset
most_recent_date = data['InvoiceDate'].max()
# calculate time from first transaction to most recent
account_age = pd.DataFrame((most_recent_date - first_purchase_date['FirstPurchaseDate']).apply(lambda x: int(x.days)))
account_age.rename(columns={'FirstPurchaseDate': 'AccountAge'}, inplace=True)
# join total customer spend with account age to calculate the monetary metric
monetary = account_age.join(total_customer_spend)
monetary['Monetary'] = monetary['TotalCustomerSpend'] / monetary['AccountAge']
monetary.drop(columns=['AccountAge', 'TotalCustomerSpend'], inplace=True)
monetary.head(3)
| CustomerID | Monetary |
|---|---|
| 12747 | 11.287561 |
| 12748 | 77.512413 |
| 12749 | 18.830892 |
Purchase Frequency Analysis¶
# mrt is each customer's most recent transaction date
mrt = data.groupby('CustomerID').agg({'InvoiceDate': 'max'})
mrt.rename(columns={'InvoiceDate': 'MostRecentTransaction'}, inplace=True)
# ft is each customer's earliest invoice date
ft = data.groupby('CustomerID').agg({'InvoiceDate': 'min'})
ft.rename(columns={'InvoiceDate': 'FirstTransaction'}, inplace=True)
# reduce to one row per distinct (customer, invoice date) pair
invoice_dates = data.groupby(['CustomerID', 'InvoiceDate'])
invoice_dates = invoice_dates.agg({'InvoiceNo': 'first'}).reset_index()
invoice_dates.drop(columns='InvoiceNo', inplace=True)
invoice_dates.set_index('CustomerID', inplace=True)
# calculate number of transactions per customer (k)
k = invoice_dates.groupby('CustomerID')
k = k.agg({'InvoiceDate': 'nunique'})
k = k.rename(columns={'InvoiceDate': 'K'})
# join most recent transaction, first
# transaction and number of transactions
frequency = mrt.join(ft).join(k)
# only keep customers who made two or
# more transactions because frequency is
# undefined for a single transaction
frequency = frequency[frequency['K'] > 1].copy()
# adjust k to k-1 to match the definition
# of frequency
frequency['K'] = frequency['K'] - 1
# calculate frequency
frequency['Frequency'] = (frequency['MostRecentTransaction'] - frequency['FirstTransaction'])
frequency['Frequency'] = frequency['Frequency'].apply(lambda x: int(x.days))
frequency['Frequency'] = frequency['Frequency']/(frequency['K'])
# drop unnecessary columns
frequency.drop(columns=['MostRecentTransaction', 'FirstTransaction', 'K'], inplace=True)
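The pipeline above implements the same $f = \frac{\theta_1 - \theta_0}{k-1}$ from the math section. A minimal sketch for one hypothetical customer with purchases on three distinct dates:

```python
import pandas as pd

# hypothetical customer: purchases on days 0, 10 and 30 (k = 3 distinct dates)
dates = pd.to_datetime(['2011-01-01', '2011-01-11', '2011-01-31'])
k = dates.nunique()                         # number of distinct purchase dates
span_days = (dates.max() - dates.min()).days  # theta_1 - theta_0
frequency = span_days / (k - 1)             # average days between purchases
print(frequency)  # 15.0
```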
Join RFM Values¶
# join recency, frequency and monetary data
rfm_data = recency.join(frequency).join(monetary)
rfm_data.dropna(inplace=True)
KMeans RFM Clustering¶
# KMeans clustering of rfm data with 4 clusters
n_clusters = 4
random_state = 0
rfm_kmeans = KMeans(
n_clusters=n_clusters,
random_state=random_state
)
rfm_kmeans.fit(rfm_data)
# extract the centroids from the fit model
centroids = rfm_kmeans.cluster_centers_
# create a new column with predicted clusters on rfm data
rfm_data['RFMCluster'] = rfm_kmeans.predict(rfm_data)
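Note that `StandardScaler` is imported in the preliminaries but the clustering above runs on raw R/F/M values. Because KMeans is distance-based and the three features live on very different scales, a scaled variant is worth considering. Scaling would change the cluster assignments (and hence the tables here), so the sketch below, which uses synthetic stand-in data, is an alternative rather than what this analysis used:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
# synthetic stand-in for rfm_data[['Recency', 'Frequency', 'Monetary']],
# with columns on deliberately different scales
X = rng.random((100, 3)) * [365, 200, 50]

# standardize each column to zero mean and unit variance before clustering
scaled = StandardScaler().fit_transform(X)
labels = KMeans(n_clusters=4, random_state=0, n_init=10).fit_predict(scaled)
```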
RFM Cluster Aggregation¶
# build a dataframe with means of recency, frequency
# and monetary, broken apart by cluster
cluster_data = rfm_data.groupby('RFMCluster')
cluster_data = cluster_data.agg({
'Recency': 'mean', 'Frequency': 'mean', 'Monetary': 'mean'
})
cluster_data
| RFMCluster | Recency | Frequency | Monetary |
|---|---|---|---|
| 0 | 19.583051 | 50.994689 | 12.981322 |
| 1 | 42.113269 | 224.430421 | 2.419265 |
| 2 | 222.645161 | 60.128648 | 2.507621 |
| 3 | 95.102450 | 84.252849 | 4.343776 |
# function to score recency values
def r_score(recency_value):
if recency_value <= r_quantiles[0.2]:
return 5
elif recency_value <= r_quantiles[0.4]:
return 4
elif recency_value <= r_quantiles[0.6]:
return 3
elif recency_value <= r_quantiles[0.8]:
return 2
else:
return 1
# function to score frequency values
def f_score(frequency_value):
if frequency_value <= f_quantiles[0.2]:
return 5
elif frequency_value <= f_quantiles[0.4]:
return 4
elif frequency_value <= f_quantiles[0.6]:
return 3
elif frequency_value <= f_quantiles[0.8]:
return 2
else:
return 1
# function to score monetary values
def m_score(monetary_value):
if monetary_value <= m_quantiles[0.2]:
return 1
elif monetary_value <= m_quantiles[0.4]:
return 2
elif monetary_value <= m_quantiles[0.6]:
return 3
elif monetary_value <= m_quantiles[0.8]:
return 4
else:
return 5
# calculate r quantiles for r scoring
r_quantiles = rfm_data['Recency'].quantile([0.2, 0.4, 0.6, 0.8])
# calculate f quantiles for f scoring
f_quantiles = rfm_data['Frequency'].quantile([0.2, 0.4, 0.6, 0.8])
# calculate m quantiles for m scoring
m_quantiles = rfm_data['Monetary'].quantile([0.2, 0.4, 0.6, 0.8])
# calculate r,f and m scores
rfm_data['RScore'] = rfm_data['Recency'].apply(r_score)
rfm_data['FScore'] = rfm_data['Frequency'].apply(f_score)
rfm_data['MScore'] = rfm_data['Monetary'].apply(m_score)
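The three scoring functions above hand-roll quintile binning. A more compact equivalent (not used in this notebook) is `pd.qcut` with inverted labels for recency and frequency, where lower raw values earn higher scores; one caveat is that tie handling at bin edges can differ slightly from the explicit `<=` comparisons above.

```python
import pandas as pd

recency = pd.Series([5, 40, 90, 180, 300])  # hypothetical raw recency values
# lower recency is better, so quintile labels run 5 (best) down to 1 (worst)
r_scores = pd.qcut(recency, q=5, labels=[5, 4, 3, 2, 1]).astype(int)
print(r_scores.tolist())  # [5, 4, 3, 2, 1]
```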
# calculate average RFM scores by cluster
cluster_scores = rfm_data.groupby('RFMCluster')
cluster_scores = cluster_scores.agg({'RScore': 'mean', 'FScore': 'mean', 'MScore': 'mean'})
cluster_scores = cluster_scores.round().astype(int)
# combine cluster scores into one 3-digit code
cluster_scores['ClusterScores'] = (
cluster_scores['RScore'].astype(str) +
cluster_scores['FScore'].astype(str) +
cluster_scores['MScore'].astype(str)
)
# average RFM scores by cluster
cluster_scores
| RFMCluster | RScore | FScore | MScore | ClusterScores |
|---|---|---|---|---|
| 0 | 4 | 4 | 4 | 444 |
| 1 | 3 | 1 | 2 | 312 |
| 2 | 1 | 3 | 2 | 132 |
| 3 | 2 | 2 | 2 | 222 |
RFM Cluster Plot¶
# calculate min and max date of the dataset
# to determine N
min_date = data['InvoiceDate'].min()
max_date = data['InvoiceDate'].max()
# calculate N
N = (max_date - min_date).days
# define slope and intercept for the RFM boundary line
intercept = N
slope = -1
# create a 10x6 figure
plt.figure(figsize=(10, 6))
# set up scatter plot
sns.scatterplot(
data=rfm_data, x='Frequency', y='Recency',
hue='RFMCluster', palette='viridis', s=100, alpha=0.6
)
# title, label and legend text
plt.title('Customer Segments Based on Frequency and Recency of Purchase')
plt.xlabel('Average Days Between Purchases (Frequency)')
plt.ylabel('Days Since Last Purchase (Recency)')
plt.legend(title='Segment')
# add x and y axis lines
plt.axhline(y=0, color='black')
plt.axvline(x=0, color='black')
# draw RFM Score boxes and text on centroids
for i in range(centroids.shape[0]):
    plt.text(
        centroids[i][1], centroids[i][0],
        s=cluster_scores.loc[i, 'ClusterScores'],
        bbox=dict(facecolor='white')
    )
# draw RFM boundary dashed line
axes = plt.gca()
x = np.array(axes.get_xlim())
y = intercept + slope * x
plt.plot(x, y, '--')
# save figure and show it
plt.savefig('ccs.png', bbox_inches='tight')
plt.show()