# main.py
# A demonstration of key clustering algorithms using scikit-learn.
#
# Before running, you may need to install scikit-learn and matplotlib:
# pip install scikit-learn matplotlib
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
print("--- Starting Clustering Algorithms Demonstration ---")

# --- Section 1: Generate and Prepare the Dataset ---
# A synthetic dataset of well-separated groups is generated with make_blobs,
# which makes it easy to see what each clustering algorithm does with it.
print("\n--- 1. Generating Sample Dataset ---")

# 300 points spread over 3 blob centers; the fixed seed keeps runs repeatable.
X, y_true = make_blobs(
    n_samples=300,
    centers=3,
    cluster_std=0.8,
    random_state=42,
)

# Standardize the features: distance-based algorithms such as DBSCAN are
# sensitive to the scale of each feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Sample data with 3 distinct blobs has been generated and scaled.")
# --- Section 2: K-Means Clustering ---
# K-Means partitions the observations into k clusters, assigning each
# observation to the cluster whose mean (centroid) is nearest.
print("\n--- 2. Applying K-Means Clustering ---")

# Ask for 3 clusters; the seed and n_init are pinned so results are repeatable.
kmeans = KMeans(
    n_clusters=3,
    n_init=10,
    random_state=42,
)
# Fit on the scaled data, then read back the label assigned to each sample.
kmeans.fit(X_scaled)
kmeans_labels = kmeans.labels_
print("K-Means has assigned a cluster label to each data point.")
# --- Section 3: DBSCAN Clustering ---
# DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
# groups together points that are closely packed together, marking as outliers
# points that lie alone in low-density regions.
print("\n--- 3. Applying DBSCAN Clustering ---")
# eps: The maximum distance between two samples for one to be considered as
#      in the neighborhood of the other.
# min_samples: The number of samples in a neighborhood for a point to be
#      considered as a core point.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
# DBSCAN labels noise points as -1. Separate them out so that the noise label
# is not counted as a cluster and so the actual number of noise points (which
# may be zero) can be reported instead of being assumed.
noise_mask = dbscan_labels == -1
n_noise_dbscan = int(np.count_nonzero(noise_mask))
n_clusters_dbscan = len(set(dbscan_labels[~noise_mask]))
print(
    f"DBSCAN found {n_clusters_dbscan} clusters "
    f"and identified {n_noise_dbscan} noise points."
)
# --- Section 4: Visualization ---
# Plot the label assignments of both algorithms side by side to compare them.
# The plot is written to a PNG, shown, and then the PNG is removed again so
# the demonstration leaves no files behind.
print("\n--- 4. Visualizing the Clustering Results ---")
# Sentinels for the cleanup step: they stay None until the figure exists and
# until this run has chosen an output file, so cleanup never touches a file
# this run did not name.
fig = None
plot_filename = None
try:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    # Plot K-Means results
    ax1.scatter(X_scaled[:, 0], X_scaled[:, 1], c=kmeans_labels, cmap='viridis', s=50, alpha=0.8)
    ax1.set_title('K-Means Clustering Results')
    ax1.set_xlabel('Feature 1 (Scaled)')
    ax1.set_ylabel('Feature 2 (Scaled)')

    # Plot DBSCAN results (same axis labelling as the K-Means panel)
    ax2.scatter(X_scaled[:, 0], X_scaled[:, 1], c=dbscan_labels, cmap='plasma', s=50, alpha=0.8)
    ax2.set_title('DBSCAN Clustering Results')
    ax2.set_xlabel('Feature 1 (Scaled)')
    ax2.set_ylabel('Feature 2 (Scaled)')

    fig.suptitle('Clustering Algorithm Comparison', fontsize=16)

    # Save the plot to a file
    plot_filename = 'clustering_comparison.png'
    fig.savefig(plot_filename)
    print(f"Comparison plot saved as '{plot_filename}'")
    plt.show()  # Display the plot
except Exception as e:
    # Top-level boundary of the demo: report the problem and carry on to cleanup.
    print(f"An error occurred during visualization: {e}")
finally:
    # Release the figure even when plotting or saving failed part-way.
    if fig is not None:
        plt.close(fig)
    # --- Clean up the created image file ---
    if plot_filename is not None and os.path.exists(plot_filename):
        os.remove(plot_filename)
        print(f"\n--- Clean up: Removed '{plot_filename}' ---")
print("\n--- End of Demonstration ---")