jax_kmeans_customer_segmentation.py

# jax_kmeans_customer_segmentation.py

import jax
import jax.numpy as jnp
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Define the k-Means clustering algorithm
def jax_kmeans(data, num_clusters, num_iterations):
    # Initialize cluster centroids randomly
    centroids = data[np.random.choice(data.shape[0], num_clusters, replace=False)]
    
    for _ in range(num_iterations):
        # Assign each data point to the nearest centroid
        distances = jax.vmap(lambda x: jnp.linalg.norm(x - centroids, axis=1))(data)
        cluster_assignments = jnp.argmin(distances, axis=1)
        
        # Update cluster centroids based on the mean of assigned data points
        def update_centroid(i, centroids):
            mask = cluster_assignments == i
            if jnp.sum(mask) > 0:
                centroids = centroids.at[i].set(jnp.mean(data[mask], axis=0))
            return centroids
        
        centroids = jax.lax.fori_loop(0, num_clusters, update_centroid, centroids)
    
    return centroids, cluster_assignments

# Preprocess the customer data
def preprocess_data(data):
    # Normalize the data (example placeholder preprocessing)
    data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
    return data

# Evaluate the clustering quality using silhouette score
def evaluate_clustering(data, cluster_assignments):
    score = silhouette_score(data, cluster_assignments)
    return score

# Example usage
# Generate sample customer data
num_samples = 1000
num_features = 5
data, _ = make_blobs(n_samples=num_samples, centers=4, n_features=num_features, random_state=42)

# Preprocess the data
preprocessed_data = preprocess_data(data)

# Set the number of clusters and iterations
num_clusters = 4
num_iterations = 10

# Perform k-Means clustering
centroids, cluster_assignments = jax_kmeans(preprocessed_data, num_clusters, num_iterations)

# Evaluate the clustering quality
silhouette_avg = evaluate_clustering(preprocessed_data, cluster_assignments)
print(f"Average Silhouette Score: {silhouette_avg:.3f}")

# Print the cluster assignments for each customer
for i in range(num_samples):
    print(f"Customer {i+1} belongs to Cluster {cluster_assignments[i]+1}")

# Possible Errors and Solutions:
# 1. ValueError: If the number of clusters is greater than the number of samples, the initialization will fail.
#    Solution: Ensure that the number of clusters is less than the number of samples.

# 2. Convergence Issues: If the number of iterations is too low, the algorithm might not converge.
#    Solution: Increase the number of iterations to ensure convergence.

# 3. NaNs in Data: NaNs in the input data can cause errors in distance calculations.
#    Solution: Handle NaNs by removing or imputing them before clustering.