This tutorial will walk you through using PyTorch to implement a Neural Collaborative Filtering (NCF) recommendation system. NCF extends traditional matrix factorisation by using neural networks to model complex user-item interactions.
Introduction
Neural Collaborative Filtering (NCF) is a state-of-the-art approach for building recommendation systems. Unlike traditional collaborative filtering methods that rely on linear models, NCF uses deep learning to capture non-linear relationships between users and items.
In this tutorial, we’ll:
- Prepare and explore the MovieLens dataset
- Implement the NCF model architecture
- Train the model
- Evaluate its performance
- Generate recommendations for users
Setup and Environment
First, let’s install the required libraries and import them:
!pip install torch numpy pandas matplotlib seaborn scikit-learn tqdm
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Seed every random number generator used below so runs are repeatable.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
machine = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {machine}")
Data Loading and Preparation
We’ll use the MovieLens 100K dataset, which contains 100,000 movie ratings from users:
!wget -nc https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -q -n ml-100k.zip
# Ratings file: tab-separated; column names are supplied explicitly.
ratings_df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Movie metadata: pipe-separated and read as Latin-1; the trailing columns
# are one flag per genre.
movies_df = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=['item_id', 'title', 'release_date', 'video_release_date',
           'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
           'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
           'Thriller', 'War', 'Western'],
)

print("Ratings data:")
print(ratings_df.head())
print("\nMovies data:")
print(movies_df[['item_id', 'title']].head())

print(f"\nTotal number of ratings: {len(ratings_df)}")
print(f"Number of unique users: {ratings_df['user_id'].nunique()}")
print(f"Number of unique movies: {ratings_df['item_id'].nunique()}")
print(f"Rating range: {ratings_df['rating'].min()} to {ratings_df['rating'].max()}")
print(f"Average rating: {ratings_df['rating'].mean():.2f}")

# Bar chart of how often each rating value occurs.
plt.figure(figsize=(10, 6))
sns.countplot(x='rating', data=ratings_df)
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

# Binarise the target: ratings of 4 or more become 1.0, everything else 0.0.
ratings_df['label'] = (ratings_df['rating'] >= 4).astype(np.float32)
Data Preparation for NCF
Now, let’s prepare the data for our NCF model:
# Hold out 20% of the interactions; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")

# The largest raw ID (not the number of distinct IDs) is what bounds the
# embedding tables, because IDs are used directly as indices. Cast to a plain
# int so the values can be used for sizes and ranges.
num_users = int(ratings_df['user_id'].max())
num_items = int(ratings_df['item_id'].max())
print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")
class NCFDataset(Dataset):
    """Tensor-backed dataset of (user, item, label) interactions.

    Args:
        df: DataFrame providing ``user_id``, ``item_id`` and ``label`` columns.

    Each sample is a dict with the keys ``'user_id'``, ``'item_id'`` and
    ``'label'``.
    """

    def __init__(self, df):
        # IDs are stored as integer (long) tensors so they can index
        # embedding tables; labels are stored as floats.
        self.user_ids = torch.tensor(df['user_id'].values, dtype=torch.long)
        self.item_ids = torch.tensor(df['item_id'].values, dtype=torch.long)
        self.labels = torch.tensor(df['label'].values, dtype=torch.float)

    def __len__(self):
        """Return the number of interactions."""
        return len(self.user_ids)

    def __getitem__(self, idx):
        """Return the interaction at ``idx`` as a dict of tensors."""
        return {
            'user_id': self.user_ids[idx],
            'item_id': self.item_ids[idx],
            'label': self.labels[idx],
        }
# Mini-batch size shared by both loaders.
batch_size = 256

# Wrap each split in the tensor-backed dataset.
train_dataset = NCFDataset(train_df)
test_dataset = NCFDataset(test_df)

# Only the training data is shuffled; the held-out split keeps its order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)
Model Architecture
Now we’ll implement the Neural Collaborative Filtering (NCF) model, which combines Generalized Matrix Factorization (GMF) and Multi-Layer Perceptron (MLP) components:
class NCF(nn.Module):
    """Neural Collaborative Filtering model with a GMF branch and an MLP branch.

    The GMF branch multiplies user and item embeddings element-wise. The MLP
    branch concatenates a second pair of embeddings and passes them through
    ``Linear`` + ``ReLU`` layers. Both results are concatenated and mapped to
    a single sigmoid probability.

    Args:
        num_users: Largest user ID; each user table has ``num_users + 1`` rows
            so the raw IDs can be used directly as indices.
        num_items: Largest item ID; each item table has ``num_items + 1`` rows.
        embedding_dim: Width of every embedding vector.
        mlp_layers: Output sizes of the successive MLP ``Linear`` layers. May
            be empty, in which case the concatenated MLP embeddings are used
            as they are.
    """

    def __init__(self, num_users, num_items, embedding_dim=32, mlp_layers=(64, 32, 16)):
        super().__init__()
        # Immutable default plus a local copy: the caller's sequence is never
        # shared with or mutated by the model.
        mlp_layers = list(mlp_layers)

        self.user_embedding_gmf = nn.Embedding(num_users + 1, embedding_dim)
        self.item_embedding_gmf = nn.Embedding(num_items + 1, embedding_dim)
        self.user_embedding_mlp = nn.Embedding(num_users + 1, embedding_dim)
        self.item_embedding_mlp = nn.Embedding(num_items + 1, embedding_dim)

        # The MLP consumes [user ; item], hence twice the embedding width.
        in_features = 2 * embedding_dim
        self.mlp_layers = nn.ModuleList()
        for layer_size in mlp_layers:
            self.mlp_layers.append(nn.Linear(in_features, layer_size))
            self.mlp_layers.append(nn.ReLU())
            in_features = layer_size

        # in_features is now the width of the MLP branch output.
        self.output_layer = nn.Linear(embedding_dim + in_features, 1)
        self.sigmoid = nn.Sigmoid()
        self._init_weights()

    def _init_weights(self):
        """Initialise embeddings ~ N(0, 0.01), Linear weights Kaiming-uniform, biases zero."""
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, mean=0.0, std=0.01)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, user_ids, item_ids):
        """Return one interaction probability per (user, item) pair.

        Args:
            user_ids: Integer tensor of user IDs.
            item_ids: Integer tensor of item IDs, aligned with ``user_ids``.

        Returns:
            Tensor of sigmoid outputs with the trailing size-1 dimension
            removed.
        """
        gmf_vector = self.user_embedding_gmf(user_ids) * self.item_embedding_gmf(item_ids)

        mlp_vector = torch.cat(
            [self.user_embedding_mlp(user_ids), self.item_embedding_mlp(item_ids)],
            dim=-1,
        )
        for layer in self.mlp_layers:
            mlp_vector = layer(mlp_vector)

        concat_vector = torch.cat([gmf_vector, mlp_vector], dim=-1)
        # Squeeze only the last dimension so a batch containing a single
        # sample keeps its batch dimension.
        return self.sigmoid(self.output_layer(concat_vector)).squeeze(-1)

    # Backward-compatible alias for the method's previous name.
    ahead = forward
# Hyperparameters: embedding width and the sizes of the MLP layers.
embedding_dim, mlp_layers = 32, [64, 32, 16]

# Build the network, then move its parameters to the selected device.
mannequin = NCF(num_users, num_items, embedding_dim, mlp_layers)
mannequin = mannequin.to(machine)
print(mannequin)
Training the Model
Let’s train our NCF model:
# Binary cross-entropy on the model's sigmoid outputs.
criterion = nn.BCELoss()

# Adam with weight decay over all model parameters.
optimizer = optim.Adam(
    params=mannequin.parameters(),
    lr=1e-3,
    weight_decay=1e-5,
)
def train_epoch(mannequin, data_loader, criterion, optimizer, machine):
    """Run one optimisation pass over ``data_loader``.

    Args:
        mannequin: Model called as ``mannequin(user_ids, item_ids)``.
        data_loader: Iterable of batches; each batch is a dict with
            ``'user_id'``, ``'item_id'`` and ``'label'`` tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        machine: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    mannequin.train()
    total_loss = 0.0
    for batch in tqdm(data_loader, desc="Training"):
        user_ids = batch['user_id'].to(machine)
        item_ids = batch['item_id'].to(machine)
        labels = batch['label'].to(machine)

        optimizer.zero_grad()
        # Flatten so the output lines up with the 1-D labels even when the
        # model squeezes a single-sample batch down to a scalar.
        outputs = mannequin(user_ids, item_ids).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(data_loader)
def consider(mannequin, data_loader, criterion, machine):
    """Evaluate the model on ``data_loader`` without updating it.

    Args:
        mannequin: Model called as ``mannequin(user_ids, item_ids)``.
        data_loader: Iterable of batches; each batch is a dict with
            ``'user_id'``, ``'item_id'`` and ``'label'`` tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        machine: Device the batch tensors are moved to.

    Returns:
        Dict with ``'loss'`` (mean per-batch loss), ``'auc'`` (ROC AUC) and
        ``'ap'`` (average precision).
    """
    from sklearn.metrics import roc_auc_score, average_precision_score

    mannequin.eval()
    total_loss = 0.0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            user_ids = batch['user_id'].to(machine)
            item_ids = batch['item_id'].to(machine)
            labels = batch['label'].to(machine)

            # Flatten so a single-sample batch still yields a 1-D tensor.
            outputs = mannequin(user_ids, item_ids).reshape(-1)
            total_loss += criterion(outputs, labels).item()

            predictions.extend(outputs.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return {
        'loss': total_loss / len(data_loader),
        'auc': roc_auc_score(true_labels, predictions),
        'ap': average_precision_score(true_labels, predictions),
    }
num_epochs = 10
history = {'train_loss': [], 'val_loss': [], 'val_auc': [], 'val_ap': []}

for epoch in range(num_epochs):
    train_loss = train_epoch(mannequin, train_loader, criterion, optimizer, machine)
    # NOTE: the "validation" numbers are computed on test_loader; there is
    # no separate validation split.
    eval_metrics = consider(mannequin, test_loader, criterion, machine)

    history['train_loss'].append(train_loss)
    history['val_loss'].append(eval_metrics['loss'])
    history['val_auc'].append(eval_metrics['auc'])
    history['val_ap'].append(eval_metrics['ap'])

    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"Train Loss: {train_loss:.4f}, "
          f"Val Loss: {eval_metrics['loss']:.4f}, "
          f"AUC: {eval_metrics['auc']:.4f}, "
          f"AP: {eval_metrics['ap']:.4f}")

# Left panel: loss curves. Right panel: ranking metrics.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history['train_loss'], label="Train Loss")
plt.plot(history['val_loss'], label="Validation Loss")
plt.title('Loss During Training')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history['val_auc'], label="AUC")
plt.plot(history['val_ap'], label="Average Precision")
plt.title('Evaluation Metrics')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.legend()

plt.tight_layout()
plt.show()

# Persist only the learned parameters (state dict), not the whole module.
torch.save(mannequin.state_dict(), 'ncf_model.pth')
print("Model saved successfully!")
Generating Recommendations
Now let’s create a function to generate recommendations for users:
def generate_recommendations(mannequin, user_id, n=10):
    """Return the top-``n`` unseen items for ``user_id`` ranked by model score.

    Reads the module-level ``num_items``, ``machine``, ``ratings_df`` and
    ``movies_df``.

    Args:
        mannequin: Model called as ``mannequin(user_ids, item_ids)``.
        user_id: ID of the user to recommend for.
        n: Maximum number of recommendations to return.

    Returns:
        DataFrame with ``item_id``, ``title`` and ``score`` columns, best
        score first.
    """
    mannequin.eval()

    # Score every item ID from 1 to num_items for this one user.
    item_ids = torch.arange(1, int(num_items) + 1, dtype=torch.long, device=machine)
    user_ids = torch.full_like(item_ids, int(user_id))
    with torch.no_grad():
        predictions = mannequin(user_ids, item_ids).reshape(-1).cpu().numpy()

    items_df = pd.DataFrame({
        'item_id': range(1, int(num_items) + 1),
        'score': predictions,
    })

    # Drop everything the user has already rated.
    user_rated_items = set(ratings_df.loc[ratings_df['user_id'] == user_id, 'item_id'].values)
    items_df = items_df[~items_df['item_id'].isin(user_rated_items)]

    top_n_items = items_df.sort_values('score', ascending=False).head(n)
    # The default inner merge keeps the left frame's key order, so the
    # ranking survives the join with the titles.
    recommendations = pd.merge(top_n_items, movies_df[['item_id', 'title']], on='item_id')
    return recommendations[['item_id', 'title', 'score']]
test_users = [1, 42, 100]
for user_id in test_users:
    print(f"\nTop 10 recommendations for user {user_id}:")
    suggestions = generate_recommendations(mannequin, user_id, n=10)
    print(suggestions)

    print(f"\nMovies that user {user_id} has rated highly (4-5 stars):")
    user_liked = ratings_df[(ratings_df['user_id'] == user_id) & (ratings_df['rating'] >= 4)]
    user_liked = pd.merge(user_liked, movies_df[['item_id', 'title']], on='item_id')
    # A bare expression inside a loop is not displayed, so print explicitly.
    print(user_liked[['item_id', 'title', 'rating']])
Evaluating the Model Further
Let’s evaluate our model further by computing some additional metrics:
def evaluate_model_with_metrics(mannequin, test_loader, machine):
    """Compute AUC, average precision and accuracy, and plot the PR curve.

    Args:
        mannequin: Model called as ``mannequin(user_ids, item_ids)``.
        test_loader: Iterable of batches; each batch is a dict with
            ``'user_id'``, ``'item_id'`` and ``'label'`` tensors.
        machine: Device the batch tensors are moved to.

    Returns:
        Dict with ``'auc'``, ``'ap'`` and ``'accuracy'``; accuracy uses a 0.5
        decision threshold.
    """
    from sklearn.metrics import (
        roc_auc_score,
        average_precision_score,
        precision_recall_curve,
        accuracy_score,
    )

    mannequin.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            user_ids = batch['user_id'].to(machine)
            item_ids = batch['item_id'].to(machine)
            labels = batch['label'].to(machine)

            # Flatten so a single-sample batch still yields a 1-D tensor.
            outputs = mannequin(user_ids, item_ids).reshape(-1)
            predictions.extend(outputs.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Hard decisions at 0.5 are only needed for accuracy.
    binary_preds = [1 if p >= 0.5 else 0 for p in predictions]
    auc = roc_auc_score(true_labels, predictions)
    ap = average_precision_score(true_labels, predictions)
    accuracy = accuracy_score(true_labels, binary_preds)

    precision, recall, _ = precision_recall_curve(true_labels, predictions)
    plt.figure(figsize=(10, 6))
    plt.plot(recall, precision, label=f'AP={ap:.3f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.grid(True)
    plt.show()

    return {
        'auc': auc,
        'ap': ap,
        'accuracy': accuracy,
    }
# Evaluate once on the held-out split and report the summary metrics.
metrics = evaluate_model_with_metrics(mannequin, test_loader, machine)
print(f"AUC: {metrics['auc']:.4f}")
print(f"Average Precision: {metrics['ap']:.4f}")
print(f"Accuracy: {metrics['accuracy']:.4f}")
Cold Start Analysis
Let’s analyze how our model performs for new users or users with few ratings (cold start problem):
# Number of ratings per user, stored in a column named 'count'.
user_rating_counts = ratings_df.groupby('user_id').size().reset_index(name="count")

# Bucket users by activity; the four labels map onto the four intervals
# between consecutive bin edges.
user_rating_counts['group'] = pd.cut(
    user_rating_counts['count'],
    bins=[0, 10, 50, 100, float('inf')],
    labels=['1-10', '11-50', '51-100', '100+'],
)

print("Number of users in each rating frequency group:")
print(user_rating_counts['group'].value_counts())
def evaluate_by_user_group(mannequin, ratings_df, user_groups, machine):
    """Compute ROC AUC separately for each group of users.

    Args:
        mannequin: Model called as ``mannequin(user_ids, item_ids)``.
        ratings_df: DataFrame with ``user_id``, ``item_id`` and ``label``
            columns to evaluate on.
        user_groups: Mapping of group name to the user IDs in that group.
        machine: Device the batch tensors are moved to.

    Returns:
        Dict mapping group name to its AUC, or to ``None`` when the AUC could
        not be computed. Groups with no rows in ``ratings_df`` are omitted.
    """
    from sklearn.metrics import roc_auc_score

    outcomes = {}
    for group_name, group_user_ids in user_groups.items():
        group_ratings = ratings_df[ratings_df['user_id'].isin(group_user_ids)]
        if group_ratings.empty:
            # Nothing to score for this group.
            continue

        group_loader = DataLoader(NCFDataset(group_ratings), batch_size=256, shuffle=False)

        mannequin.eval()
        predictions = []
        true_labels = []
        with torch.no_grad():
            for batch in group_loader:
                batch_users = batch['user_id'].to(machine)
                batch_items = batch['item_id'].to(machine)
                labels = batch['label'].to(machine)

                # Flatten so a trailing single-sample batch still yields a
                # 1-D tensor that can be appended element by element.
                outputs = mannequin(batch_users, batch_items).reshape(-1)
                predictions.extend(outputs.cpu().numpy())
                true_labels.extend(labels.cpu().numpy())

        try:
            outcomes[group_name] = roc_auc_score(true_labels, predictions)
        except ValueError:
            # AUC is undefined here, e.g. when the group's labels contain
            # only one class; record the gap instead of failing.
            outcomes[group_name] = None
    return outcomes
# Map every activity bucket that actually occurs to the user IDs inside it.
user_groups = {}
for group in user_rating_counts['group'].unique():
    users_in_group = user_rating_counts[user_rating_counts['group'] == group]['user_id'].values
    user_groups[group] = users_in_group

# Score each bucket on the held-out interactions only.
group_performance = evaluate_by_user_group(mannequin, test_df, user_groups, machine)

plt.figure(figsize=(10, 6))
teams = []
aucs = []
for group, auc in group_performance.items():
    # Skip buckets whose AUC could not be computed.
    if auc is not None:
        teams.append(group)
        aucs.append(auc)
plt.bar(teams, aucs)
plt.xlabel('Number of Ratings per User')
plt.ylabel('AUC Score')
plt.title('Model Performance by User Rating Frequency (Cold Start Analysis)')
# The y-axis starts at 0.5, the AUC of a random ranking.
plt.ylim(0.5, 1.0)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

print("AUC scores by user rating frequency:")
for group, auc in group_performance.items():
    if auc is not None:
        print(f"{group}: {auc:.4f}")
Business Insights and Extensions
def analyze_predictions(mannequin, data_loader, machine):
    """Plot and summarise the predicted scores against the true labels.

    Draws a histogram of all predicted scores and a box plot of scores split
    by true label, then prints the mean predicted score per label.

    Args:
        mannequin: Model called as ``mannequin(user_ids, item_ids)``.
        data_loader: Iterable of batches; each batch is a dict with
            ``'user_id'``, ``'item_id'`` and ``'label'`` tensors.
        machine: Device the batch tensors are moved to.
    """
    mannequin.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in data_loader:
            user_ids = batch['user_id'].to(machine)
            item_ids = batch['item_id'].to(machine)
            labels = batch['label'].to(machine)

            # Flatten so a single-sample batch still yields a 1-D tensor.
            outputs = mannequin(user_ids, item_ids).reshape(-1)
            predictions.extend(outputs.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    results_df = pd.DataFrame({
        'true_label': true_labels,
        'predicted_score': predictions,
    })

    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    sns.histplot(results_df['predicted_score'], bins=30, kde=True)
    plt.title('Distribution of Predicted Scores')
    plt.xlabel('Predicted Score')
    plt.ylabel('Count')

    plt.subplot(1, 2, 2)
    sns.boxplot(x='true_label', y='predicted_score', data=results_df)
    plt.title('Predicted Scores by True Label')
    plt.xlabel('True Label (0=Disliked, 1=Liked)')
    plt.ylabel('Predicted Score')

    plt.tight_layout()
    plt.show()

    avg_scores = results_df.groupby('true_label')['predicted_score'].mean()
    # .get() with a NaN fallback keeps the report working when one of the
    # two classes is absent from the data.
    print("Average prediction scores:")
    print(f"Items user disliked (0): {avg_scores.get(0.0, float('nan')):.4f}")
    print(f"Items user liked (1): {avg_scores.get(1.0, float('nan')):.4f}")
# Inspect the score distribution on the held-out interactions.
analyze_predictions(
    mannequin,
    test_loader,
    machine,
)
This tutorial demonstrates implementing Neural Collaborative Filtering, a deep learning recommendation system combining matrix factorization with neural networks. Using the MovieLens dataset and PyTorch, we built a model that generates personalized content recommendations. The implementation addresses key challenges, including the cold start problem, and provides performance metrics like AUC and precision-recall curves. This foundation can be extended with hybrid approaches, attention mechanisms, or deployable web applications for various business recommendation scenarios.
Here is the Colab Notebook. Also, don’t forget to follow us on Twitter and join our Telegram Channel and LinkedIn Group. Don’t forget to join our 85k+ ML SubReddit.