In [1]:
from ucimlrepo import fetch_ucirepo
# 1. Fetch the UCI Online Retail dataset
online_retail = fetch_ucirepo(id=352)
# 2. Import required libraries
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
# 3. Load the features DataFrame
df = online_retail.data.features
print("Columns:", df.columns.tolist()) # Show available columns for reference
# 4. Convert 'InvoiceDate' to datetime and drop rows with missing CustomerID
df['datetime'] = pd.to_datetime(df['InvoiceDate'])
df = df.dropna(subset=['CustomerID']).copy()
# 5. Calculate the total price for each row (Quantity * UnitPrice)
df['total'] = df['Quantity'] * df['UnitPrice']
# 6. Set reference date for Recency calculation (the day after the last transaction)
NOW = df['datetime'].max() + pd.Timedelta(days=1)
# 7. RFM Calculation:
# - Recency: Days since last purchase for each customer
# - Frequency: Number of unique purchase dates (as InvoiceNo is missing)
# - Monetary: Total amount spent by each customer
rfm = df.groupby('CustomerID').agg(
recency=('datetime', lambda x: (NOW - x.max()).days),
frequency=('datetime', 'nunique'),
monetary=('total', 'sum')
).reset_index()
# 8. Robust quantile binning for RFM scores
def safe_qcut(series, q, labels):
"""
Assigns quantile-based scores to a series, handling cases where there are
not enough unique values to form the desired number of bins.
"""
try:
return pd.qcut(series, q, labels=labels, duplicates='drop').astype(int)
except ValueError:
n_bins = min(len(pd.unique(series)), q)
return pd.qcut(series, n_bins, labels=labels[:n_bins], duplicates='drop').astype(int)
# 9. Assign R, F, M scores (higher is better for F/M, lower is better for R)
# - Recency: Lower recency = more recent = higher score
recency_bins = len(pd.qcut(rfm['recency'], 5, duplicates='drop').unique())
rfm['r_score'] = pd.qcut(rfm['recency'], recency_bins, labels=list(range(recency_bins, 0, -1)), duplicates='drop').astype(int)
# - Frequency: Higher frequency = higher score
frequency_bins = len(pd.qcut(rfm['frequency'].rank(method='first'), 5, duplicates='drop').unique())
rfm['f_score'] = pd.qcut(rfm['frequency'].rank(method='first'), frequency_bins, labels=list(range(1, frequency_bins+1)), duplicates='drop').astype(int)
# - Monetary: Higher monetary = higher score
monetary_bins = len(pd.qcut(rfm['monetary'], 5, duplicates='drop').unique())
rfm['m_score'] = pd.qcut(rfm['monetary'], monetary_bins, labels=list(range(1, monetary_bins+1)), duplicates='drop').astype(int)
# 10. Combine RFM scores into a single string for segmentation
rfm['rfm_score'] = rfm['r_score'].astype(str) + rfm['f_score'].astype(str) + rfm['m_score'].astype(str)
# 11. Output the sorted RFM table
print(rfm.sort_values(by='rfm_score', ascending=False))
Columns: ['Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']
C:\Users\ubuntu\AppData\Local\Temp\ipykernel_39740\3634326392.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['datetime'] = pd.to_datetime(df['InvoiceDate'])
CustomerID recency frequency monetary r_score f_score m_score \ 1794 14769.0 3 11 10641.60 5 5 5 942 13599.0 1 31 5030.60 5 5 5 1966 15023.0 3 12 6763.16 5 5 5 4053 17857.0 3 25 26763.34 5 5 5 4054 17858.0 6 12 5155.66 5 5 5 ... ... ... ... ... ... ... ... 324 12738.0 372 1 155.35 1 1 1 1263 14046.0 284 1 154.99 1 1 1 1260 14041.0 266 1 141.41 1 1 1 2332 15503.0 363 1 147.09 1 1 1 2649 15923.0 373 1 127.08 1 1 1 rfm_score 1794 555 942 555 1966 555 4053 555 4054 555 ... ... 324 111 1263 111 1260 111 2332 111 2649 111 [4372 rows x 8 columns]
In [ ]:
# 12. Improved Plot: RFM Score Distribution (future-proofed for seaborn)
plt.figure(figsize=(18,5)) # Wider figure
order = sorted(rfm['rfm_score'].unique())
ax = sns.countplot(
x='rfm_score',
hue='rfm_score',
data=rfm,
order=order,
palette='crest',
legend=False
)
plt.title('RFM Score Distribution', fontsize=16)
plt.xlabel('RFM Score', fontsize=12)
plt.ylabel('Customer Count', fontsize=12)
# Show only every Nth label (increase N for fewer labels)
N = max(1, len(order) // 30)
for idx, label in enumerate(ax.get_xticklabels()):
if idx % N != 0:
label.set_visible(False)
plt.xticks(rotation=45)
for p in ax.patches:
ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center', fontsize=9, color='black', xytext=(0, 5),
textcoords='offset points')
plt.tight_layout()
plt.show()
In [ ]:
import numpy as np
# Avoid log(0) by adding a small constant
rfm['monetary_log'] = np.log1p(rfm['monetary'].clip(lower=0).fillna(0))
plt.figure(figsize=(16,7))
scatter = sns.scatterplot(
x='recency', y='frequency',
size='monetary_log', hue='monetary_log',
data=rfm, palette='viridis', alpha=0.7,
sizes=(40, 400), legend='brief'
)
plt.title('Recency vs Frequency (Bubble size & color = log(Monetary))', fontsize=16)
plt.xlabel('Recency (days)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend(title='log(Monetary)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
c:\Users\ubuntu\.conda\envs\corp-act\Lib\site-packages\pandas\core\arraylike.py:399: RuntimeWarning: invalid value encountered in log1p result = getattr(ufunc, method)(*inputs, **kwargs)
In [5]:
# 14. Improved Plot: Heatmap of Average Monetary Value by R and F Scores
rfm_pivot = rfm.pivot_table(index='f_score', columns='r_score', values='monetary', aggfunc='mean')
plt.figure(figsize=(9,7))
sns.heatmap(rfm_pivot, annot=True, fmt=".0f", cmap='YlGnBu', linewidths=0.5, cbar_kws={'label': 'Avg Monetary Value'})
plt.title('Average Monetary Value by R and F Scores', fontsize=16)
plt.xlabel('Recency Score', fontsize=12)
plt.ylabel('Frequency Score', fontsize=12)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()