from ucimlrepo import fetch_ucirepo

# 1. Fetch the UCI Online Retail dataset
online_retail = fetch_ucirepo(id=352)

# 2. Import required libraries
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns

# 3. Load the features DataFrame
df = online_retail.data.features
print("Columns:", df.columns.tolist())  # Show available columns for reference

# 4. Convert 'InvoiceDate' to datetime and drop rows with missing CustomerID
df['datetime'] = pd.to_datetime(df['InvoiceDate'])
df = df.dropna(subset=['CustomerID']).copy()

# 5. Calculate the total price for each row (Quantity * UnitPrice)
df['total'] = df['Quantity'] * df['UnitPrice']

# 6. Set reference date for Recency calculation (the day after the last transaction)
NOW = df['datetime'].max() + pd.Timedelta(days=1)

# 7. RFM Calculation:
#    - Recency: Days since last purchase for each customer
#    - Frequency: Number of unique purchase dates (as InvoiceNo is missing)
#    - Monetary: Total amount spent by each customer
rfm = df.groupby('CustomerID').agg(
    recency=('datetime', lambda x: (NOW - x.max()).days),
    frequency=('datetime', 'nunique'),
    monetary=('total', 'sum')
).reset_index()

# 8. Robust quantile binning for RFM scores
def safe_qcut(series, q, labels):
    """
    Assigns quantile-based scores to a series, handling cases where there are
    not enough unique values to form the desired number of bins.
    """
    try:
        return pd.qcut(series, q, labels=labels, duplicates='drop').astype(int)
    except ValueError:
        n_bins = min(len(pd.unique(series)), q)
        return pd.qcut(series, n_bins, labels=labels[:n_bins], duplicates='drop').astype(int)

# 9. Assign R, F, M scores (higher is better for F/M, lower is better for R)
#    - Recency: Lower recency = more recent = higher score
recency_bins = len(pd.qcut(rfm['recency'], 5, duplicates='drop').unique())
rfm['r_score'] = pd.qcut(rfm['recency'], recency_bins, labels=list(range(recency_bins, 0, -1)), duplicates='drop').astype(int)

#    - Frequency: Higher frequency = higher score
frequency_bins = len(pd.qcut(rfm['frequency'].rank(method='first'), 5, duplicates='drop').unique())
rfm['f_score'] = pd.qcut(rfm['frequency'].rank(method='first'), frequency_bins, labels=list(range(1, frequency_bins+1)), duplicates='drop').astype(int)

#    - Monetary: Higher monetary = higher score
monetary_bins = len(pd.qcut(rfm['monetary'], 5, duplicates='drop').unique())
rfm['m_score'] = pd.qcut(rfm['monetary'], monetary_bins, labels=list(range(1, monetary_bins+1)), duplicates='drop').astype(int)

# 10. Combine RFM scores into a single string for segmentation
rfm['rfm_score'] = rfm['r_score'].astype(str) + rfm['f_score'].astype(str) + rfm['m_score'].astype(str)

# 11. Output the sorted RFM table
print(rfm.sort_values(by='rfm_score', ascending=False))

Columns: ['Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

C:\Users\ubuntu\AppData\Local\Temp\ipykernel_39740\3634326392.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['InvoiceDate'])

      CustomerID  recency  frequency  monetary  r_score  f_score  m_score  \
1794     14769.0        3         11  10641.60        5        5        5   
942      13599.0        1         31   5030.60        5        5        5   
1966     15023.0        3         12   6763.16        5        5        5   
4053     17857.0        3         25  26763.34        5        5        5   
4054     17858.0        6         12   5155.66        5        5        5   
...          ...      ...        ...       ...      ...      ...      ...   
324      12738.0      372          1    155.35        1        1        1   
1263     14046.0      284          1    154.99        1        1        1   
1260     14041.0      266          1    141.41        1        1        1   
2332     15503.0      363          1    147.09        1        1        1   
2649     15923.0      373          1    127.08        1        1        1   

     rfm_score  
1794       555  
942        555  
1966       555  
4053       555  
4054       555  
...        ...  
324        111  
1263       111  
1260       111  
2332       111  
2649       111  

[4372 rows x 8 columns]

# 12. Improved Plot: RFM Score Distribution (future-proofed for seaborn)
plt.figure(figsize=(18,5))  # Wider figure
order = sorted(rfm['rfm_score'].unique())
ax = sns.countplot(
    x='rfm_score', 
    hue='rfm_score', 
    data=rfm, 
    order=order, 
    palette='crest', 
    legend=False
)
plt.title('RFM Score Distribution', fontsize=16)
plt.xlabel('RFM Score', fontsize=12)
plt.ylabel('Customer Count', fontsize=12)

# Show only every Nth label (increase N for fewer labels)
N = max(1, len(order) // 30)
for idx, label in enumerate(ax.get_xticklabels()):
    if idx % N != 0:
        label.set_visible(False)
plt.xticks(rotation=45)

for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=9, color='black', xytext=(0, 5),
                textcoords='offset points')
plt.tight_layout()
plt.show()

import numpy as np

# Avoid log(0) by adding a small constant
rfm['monetary_log'] = np.log1p(rfm['monetary'].clip(lower=0).fillna(0))

plt.figure(figsize=(16,7))
scatter = sns.scatterplot(
    x='recency', y='frequency',
    size='monetary_log', hue='monetary_log',
    data=rfm, palette='viridis', alpha=0.7,
    sizes=(40, 400), legend='brief'
)
plt.title('Recency vs Frequency (Bubble size & color = log(Monetary))', fontsize=16)
plt.xlabel('Recency (days)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend(title='log(Monetary)', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

c:\Users\ubuntu\.conda\envs\corp-act\Lib\site-packages\pandas\core\arraylike.py:399: RuntimeWarning: invalid value encountered in log1p
  result = getattr(ufunc, method)(*inputs, **kwargs)

# 14. Improved Plot: Heatmap of Average Monetary Value by R and F Scores
rfm_pivot = rfm.pivot_table(index='f_score', columns='r_score', values='monetary', aggfunc='mean')
plt.figure(figsize=(9,7))
sns.heatmap(rfm_pivot, annot=True, fmt=".0f", cmap='YlGnBu', linewidths=0.5, cbar_kws={'label': 'Avg Monetary Value'})
plt.title('Average Monetary Value by R and F Scores', fontsize=16)
plt.xlabel('Recency Score', fontsize=12)
plt.ylabel('Frequency Score', fontsize=12)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()