Study | StudyLover

Program for Clustering Algorithms

Download

Program for Classification Algorithms : Program for a Simple Content-Based Recommender System

Programs

# main.py

# A demonstration of key clustering algorithms using scikit-learn.

# Before running, you may need to install scikit-learn and matplotlib:

# pip install scikit-learn matplotlib

import numpy as np

import matplotlib.pyplot as plt

import os

from sklearn.datasets import make_blobs

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans, DBSCAN

print("--- Starting Clustering Algorithms Demonstration ---")

# --- Section 1: Generate and Prepare the Dataset ---
# make_blobs produces separate groups of points, which makes it easy to see
# how each clustering algorithm partitions the data.
print("\n--- 1. Generating Sample Dataset ---")

# 300 samples drawn around 3 centers; the fixed seed keeps runs reproducible.
X, y_true = make_blobs(
    n_samples=300,
    centers=3,
    cluster_std=0.8,
    random_state=42,
)

# Standardize the features: distance-based algorithms such as DBSCAN are
# sensitive to the scale of each feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Sample data with 3 distinct blobs has been generated and scaled.")

# --- Section 2: K-Means Clustering ---
# K-Means partitions n observations into k clusters, assigning each
# observation to the cluster whose mean (centroid) is nearest.
print("\n--- 2. Applying K-Means Clustering ---")

# Ask for exactly 3 clusters; n_init=10 runs the algorithm from 10 different
# centroid seeds, and random_state fixes those seeds for reproducibility.
kmeans = KMeans(
    n_clusters=3,
    n_init=10,
    random_state=42,
)

# Fit the model and obtain one cluster label per sample in a single call.
kmeans_labels = kmeans.fit_predict(X_scaled)
print("K-Means has assigned a cluster label to each data point.")

# --- Section 3: DBSCAN Clustering ---
# DBSCAN (Density-Based Spatial Clustering of Applications with Noise) groups
# points that are densely packed together and flags points lying alone in
# low-density regions as outliers.
print("\n--- 3. Applying DBSCAN Clustering ---")

# eps:         maximum distance between two samples for one to count as being
#              in the neighborhood of the other.
# min_samples: number of samples in a neighborhood required for a point to be
#              treated as a core point.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)

# DBSCAN marks noise with the label -1, which is not a real cluster, so it is
# excluded from the cluster count when present.
unique_dbscan_labels = set(dbscan_labels)
n_clusters_dbscan = len(unique_dbscan_labels) - int(-1 in unique_dbscan_labels)
print(f"DBSCAN found {n_clusters_dbscan} clusters and identified noise points.")

# --- Section 4: Visualization ---
# Plot the results of both algorithms side by side so they can be compared.
print("\n--- 4. Visualizing the Clustering Results ---")

# The output path is defined up front so the cleanup step can always refer to
# it; save_attempted records whether this run may have written the file, so
# cleanup only removes a file once a save has been attempted.
plot_filename = 'clustering_comparison.png'
save_attempted = False

try:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    # Left panel: K-Means results, colored by assigned cluster label.
    ax1.scatter(X_scaled[:, 0], X_scaled[:, 1], c=kmeans_labels, cmap='viridis', s=50, alpha=0.8)
    ax1.set_title('K-Means Clustering Results')
    ax1.set_xlabel('Feature 1 (Scaled)')
    ax1.set_ylabel('Feature 2 (Scaled)')

    # Right panel: DBSCAN results (noise points carry the label -1 and
    # therefore get their own color).
    ax2.scatter(X_scaled[:, 0], X_scaled[:, 1], c=dbscan_labels, cmap='plasma', s=50, alpha=0.8)
    ax2.set_title('DBSCAN Clustering Results')
    ax2.set_xlabel('Feature 1 (Scaled)')
    # Label the y-axis here too so both panels are annotated consistently.
    ax2.set_ylabel('Feature 2 (Scaled)')

    fig.suptitle('Clustering Algorithm Comparison', fontsize=16)

    # Save the plot to a file, addressing the figure explicitly rather than
    # relying on pyplot's implicit "current figure".
    save_attempted = True
    fig.savefig(plot_filename)
    print(f"Comparison plot saved as '{plot_filename}'")

    plt.show()  # Display the plot
    plt.close(fig)
except Exception as e:
    # Top-level boundary of a demo script: report the problem and carry on to
    # cleanup instead of aborting with a traceback.
    print(f"An error occurred during visualization: {e}")
finally:
    # --- Clean up the created image file ---
    if save_attempted and os.path.exists(plot_filename):
        os.remove(plot_filename)
        print(f"\n--- Clean up: Removed '{plot_filename}' ---")

print("\n--- End of Demonstration ---")

Program for Classification Algorithms : Program for a Simple Content-Based Recommender System