# main.py
# A demonstration of a simple content-based recommender system.
#
# Before running, you may need to install scikit-learn and pandas:
# pip install scikit-learn pandas
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
print("--- Starting Content-Based Recommender System Demonstration ---")
# --- Section 1: Create a Sample Dataset ---
# A tiny hand-written catalogue stands in for a real movie database. Each
# entry pairs a title with the one-sentence description that serves as the
# movie's "content" for the recommender.
_MOVIES = [
    ("The Matrix",
     "A computer hacker learns about the true nature of his reality."),
    ("Inception",
     "A thief who steals corporate secrets through use of dream-sharing technology."),
    ("Interstellar",
     "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival."),
    ("The Martian",
     "An astronaut becomes stranded on Mars after his team assumes him dead."),
    ("Blade Runner 2049",
     "A new blade runner unearths a long-buried secret that has the potential to plunge what's left of society into chaos."),
    ("Gravity",
     "Two astronauts work together to survive after an accident leaves them stranded in space."),
    ("Arrival",
     "A linguist works with the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world."),
    ("Ex Machina",
     "A young programmer is selected to evaluate the human qualities of a highly advanced humanoid A.I."),
]
# Split the (title, description) pairs into the column-oriented dict handed
# to pandas; row order follows the order of _MOVIES.
data = {
    'title': [title for title, _ in _MOVIES],
    'description': [description for _, description in _MOVIES],
}
df = pd.DataFrame(data)
print("\n--- 1. Sample Movie Dataset ---")
print(df)
# --- Section 2: Feature Extraction (TF-IDF) ---
# The descriptions are free text, so they are first turned into numeric
# vectors. TF-IDF (Term Frequency-Inverse Document Frequency) weights a word
# highly when it is frequent in one description but rare across the others.
print("\n--- 2. Converting Text to Numerical Features using TF-IDF ---")
# stop_words='english' drops common English words before weighting.
tfidf = TfidfVectorizer(stop_words='english')
# One row per movie, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(df['description'])
print(f"TF-IDF matrix created with shape: {tfidf_matrix.shape}")
# --- Section 3: Compute Cosine Similarity ---
# Score every pair of movies by the cosine of the angle between their TF-IDF
# vectors; a larger value means the two descriptions are more alike.
print("\n--- 3. Computing Cosine Similarity Matrix ---")
# With a single argument, cosine_similarity compares the rows of the matrix
# against each other, i.e. the same as passing the matrix twice.
cosine_sim = cosine_similarity(tfidf_matrix)
print(f"Cosine similarity matrix created with shape: {cosine_sim.shape}")
# --- Section 4: Build the Recommender Function ---
# This function will take a movie title and return the most similar movies.
def get_recommendations(title, cosine_sim_matrix, dataframe, top_n=3):
    """Return the titles most similar to ``title``.

    Args:
        title: Exact title to look up in ``dataframe['title']``. If several
            rows share the title, the first matching row is used.
        cosine_sim_matrix: Row-indexable matrix of pairwise similarity
            scores. Row ``i`` / column ``j`` must correspond to row positions
            ``i`` and ``j`` of ``dataframe``.
        dataframe: DataFrame with a ``'title'`` column.
        top_n: Maximum number of recommendations to return (default 3).
            Values below zero are treated as zero.

    Returns:
        A pandas Series with up to ``top_n`` titles ordered from most to
        least similar (the queried movie itself is never included), or an
        error-message string when ``title`` is not present. Callers tell the
        two cases apart with ``isinstance(result, str)``.
    """
    titles = dataframe['title']

    # Work with row *positions*, not index labels: both the similarity matrix
    # and ``.iloc`` are positional, so this stays correct for DataFrames whose
    # index is not the default 0..n-1 range.
    positions = (titles == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        return f"Movie '{title}' not found in the dataset."
    idx = int(positions[0])

    # Exclude the queried movie explicitly instead of assuming it sorts first.
    # A tie at the top score (e.g. a duplicate description) or an all-zero
    # vector would otherwise let the movie recommend itself.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim_matrix[idx])
        if pos != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in sim_scores[:max(top_n, 0)]]
    return titles.iloc[movie_indices]
# --- Section 5: Get Recommendations ---
print("\n--- 5. Getting Recommendations ---")
movie_title = 'Gravity'
recommendations = get_recommendations(movie_title, cosine_sim, df)
print(f"\nRecommendations for '{movie_title}':")
# A string result is the "not found" message; anything else is the collection
# of recommended titles, printed as a 1-based ranked list.
if not isinstance(recommendations, str):
    for rank, rec in enumerate(recommendations, start=1):
        print(f"{rank}. {rec}")
else:
    print(recommendations)
print("\n--- End of Demonstration ---")