Differential Privacy Examples

This section demonstrates how to use SecureML’s differential privacy features to train machine learning models with formal privacy guarantees.

PyTorch Model with Differential Privacy

In this example, we’ll train a PyTorch neural network with differential privacy using Opacus under the hood:

import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from secureml.privacy import differentially_private_train

# Build a reproducible synthetic binary-classification problem
np.random.seed(42)
n_samples, n_features = 1000, 10

# Draw the features, a random weight vector, and the label noise in this exact
# order so the seeded random stream is consumed the same way on every run
X = np.random.randn(n_samples, n_features)
w = np.random.randn(n_features)
label_noise = 0.1 * np.random.randn(n_samples)
# A sample is labelled 1 when its noisy linear score is positive, else 0
y = (np.dot(X, w) + label_noise > 0).astype(int)

# Hold out 20% of the samples for testing, then standardise the features.
# The scaler is fitted on the training split only and reused for the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepare data for SecureML (expects DataFrame or numpy array with target as last column)
train_data = np.hstack([X_train, y_train.reshape(-1, 1)])

# Define a PyTorch model
class BinaryClassifier(nn.Module):
    """Feed-forward binary classifier with two hidden layers.

    The input passes through linear layers of 32 and 16 units, each followed
    by a ReLU, and then through a single-unit linear layer whose output is
    squashed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are registered in forward order: layer1 -> layer2 -> layer3
        self.layer1 = nn.Linear(input_dim, 32)
        self.layer2 = nn.Linear(32, 16)
        self.layer3 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network for the input batch `x`."""
        hidden = self.relu(self.layer1(x))
        hidden = self.relu(self.layer2(hidden))
        return self.sigmoid(self.layer3(hidden))

# Build the classifier with one input unit per feature column
input_dim = X_train.shape[1]
model = BinaryClassifier(input_dim)

# Train with differential privacy
dp_model = differentially_private_train(
    model=model,
    data=train_data,
    framework="pytorch",
    # Privacy parameters
    epsilon=1.0,  # Privacy budget
    delta=1e-5,   # Privacy delta
    # Optimisation parameters
    epochs=5,
    batch_size=64,
    learning_rate=0.001,
    validation_split=0.1,
    verbose=True,
)

# Evaluate on the held-out test split; no gradients are needed for inference
dp_model.eval()
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
with torch.no_grad():
    probabilities = dp_model(X_test_tensor).numpy().flatten()
# Threshold the model outputs at 0.5 to obtain class predictions
y_pred = probabilities > 0.5
accuracy = np.mean(y_pred == y_test)
print(f"Test accuracy with differential privacy: {accuracy:.4f}")

TensorFlow Model with Differential Privacy

SecureML also supports differential privacy for TensorFlow models using TensorFlow Privacy in an isolated environment:

import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from secureml.privacy import differentially_private_train

# Build a reproducible synthetic three-class problem
np.random.seed(42)
n_samples, n_features, n_classes = 1000, 10, 3

# Every class has its own random weight column; a sample's label is the class
# with the highest linear score
X = np.random.randn(n_samples, n_features)
w = np.random.randn(n_features, n_classes)
logits = np.dot(X, w)
y = logits.argmax(axis=1)

# Hold out 20% of the samples for testing, then standardise the features.
# The scaler is fitted on the training split only and reused for the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepare data for SecureML: features first, target as the last column
train_data = np.hstack([X_train, y_train.reshape(-1, 1)])

# Define a TensorFlow model: two ReLU hidden layers followed by a softmax
# over the classes
tf_model = tf.keras.Sequential()
tf_model.add(tf.keras.layers.Dense(32, activation='relu', input_shape=(n_features,)))
tf_model.add(tf.keras.layers.Dense(16, activation='relu'))
tf_model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

# Compile the model; the sparse loss takes integer class labels directly
tf_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train with differential privacy
dp_tf_model = differentially_private_train(
    model=tf_model,
    data=train_data,
    framework="tensorflow",
    # Privacy parameters
    epsilon=1.0,
    delta=1e-5,
    max_grad_norm=1.0,
    # Optimisation parameters
    epochs=5,
    batch_size=64,
    verbose=True,
)

# Evaluate the model: evaluate() yields the loss followed by the compiled metrics
evaluation = dp_tf_model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = evaluation
print(f"Test accuracy with differential privacy: {test_accuracy:.4f}")

Privacy-Utility Tradeoff

One important aspect of differential privacy is understanding the tradeoff between privacy and utility. Here’s an example that evaluates model performance across different privacy budgets (epsilon values):

import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from secureml.privacy import differentially_private_train

# Create dataset and prepare data as in previous examples
# ...

# Privacy budgets (epsilon values) to sweep; a smaller epsilon means more privacy
epsilons = [
    0.1,
    0.5,
    1.0,
    5.0,
    10.0,
]
# Test accuracy for each budget, appended in the same order as `epsilons`
accuracies = list()

# Function to create a model with the same architecture
def create_model():
    """Return a new, untrained binary classifier.

    The network reads its input width from the module-level `n_features`,
    which must be defined before this function is called. It stacks linear
    layers of 32, 16 and 1 units, with ReLU after the first two and a
    sigmoid after the last.
    """
    layers = [
        nn.Linear(n_features, 32),
        nn.ReLU(),
        nn.Linear(32, 16),
        nn.ReLU(),
        nn.Linear(16, 1),
        nn.Sigmoid(),
    ]
    return nn.Sequential(*layers)

# The test tensor does not depend on epsilon, so build it once instead of
# converting X_test again on every iteration
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Train and evaluate one model per privacy budget
for epsilon in epsilons:
    # Start from a fresh, untrained model so runs do not share weights
    model = create_model()

    # Train with differential privacy; only epsilon changes between runs
    dp_model = differentially_private_train(
        model=model,
        data=train_data,
        epsilon=epsilon,
        delta=1e-5,
        batch_size=64,
        epochs=5,
        framework="pytorch",
        verbose=False
    )

    # Evaluate on the held-out test split; no gradients are needed for inference
    dp_model.eval()
    with torch.no_grad():
        # Threshold the sigmoid outputs at 0.5 to obtain class predictions
        y_pred = dp_model(X_test_tensor).numpy().flatten() > 0.5
    accuracy = np.mean(y_pred == y_test)
    # Appended in loop order, so accuracies[i] corresponds to epsilons[i]
    accuracies.append(accuracy)

# Train a non-private model for comparison.
# NOTE: assumes X_train, y_train, X_test and y_test come from the binary
# classification data prepared earlier, with y_train holding 0/1 labels.
non_private_model = create_model()
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
# BCELoss needs float targets with the same shape as the model output, (n, 1)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(non_private_model.parameters(), lr=0.001)

# Use the same batch size and epoch count as the DP runs so the comparison
# differs only in the privacy mechanism
baseline_batch_size = 64
baseline_epochs = 5
non_private_model.train()
for _ in range(baseline_epochs):
    # Visit the training samples in a new random order every epoch
    shuffled = torch.randperm(X_train_tensor.shape[0])
    for start in range(0, shuffled.shape[0], baseline_batch_size):
        batch = shuffled[start:start + baseline_batch_size]
        optimizer.zero_grad()
        loss = loss_fn(non_private_model(X_train_tensor[batch]), y_train_tensor[batch])
        loss.backward()
        optimizer.step()

# Evaluate the baseline exactly like the DP models: threshold outputs at 0.5
non_private_model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    baseline_pred = non_private_model(X_test_tensor).numpy().flatten() > 0.5
# Previously this name was used by the plot below without ever being defined
non_private_accuracy = np.mean(baseline_pred == y_test)

# Plot the privacy-utility tradeoff
plt.figure(figsize=(10, 6))
plt.plot(epsilons, accuracies, 'o-', label='DP Model')
# Horizontal reference line: the accuracy reached without any privacy noise
plt.axhline(y=non_private_accuracy, color='r', linestyle='--', label='Non-private Model')
# Epsilon values span two orders of magnitude, so use a log-scaled x axis
plt.xscale('log')
plt.xlabel('Privacy Budget (ε)')
plt.ylabel('Test Accuracy')
plt.title('Privacy-Utility Tradeoff')
plt.grid(True, alpha=0.3)
plt.legend()
plt.savefig('privacy_utility_tradeoff.png')

This saves a graph to privacy_utility_tradeoff.png showing how model accuracy changes as the privacy budget (epsilon) increases. Typically, as epsilon increases (less privacy), accuracy improves and approaches the non-private model’s performance.

Federated Learning with Differential Privacy

SecureML allows combining federated learning with differential privacy for enhanced privacy guarantees:

import torch
import torch.nn as nn
from secureml.federated import start_federated_client

# Define a model: 10 inputs, two ReLU hidden layers, one sigmoid output
layers = [
    nn.Linear(10, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
    nn.Sigmoid(),
]
model = nn.Sequential(*layers)

# Differential privacy settings passed to the federated client
dp_options = {
    "apply_differential_privacy": True,
    "epsilon": 1.0,
    "delta": 1e-5,
    "max_grad_norm": 1.0,
}

# Start a federated learning client with differential privacy.
# NOTE(review): `client_data` is not defined in this snippet; it must hold the
# client's local dataset before this call runs.
start_federated_client(
    model=model,
    data=client_data,  # Client's local dataset
    server_address="localhost:8080",
    **dp_options,
)

Advanced Options and Best Practices

Setting Hyperparameters

When training with differential privacy, several hyperparameters can significantly affect both privacy and utility:

dp_model = differentially_private_train(
    # What to train, and with which backend
    model=model,
    data=train_data,
    framework="pytorch",    # or "tensorflow"

    # Privacy budget
    epsilon=1.0,
    delta=1e-5,

    # Noise and clipping parameters
    max_grad_norm=1.0,      # Clipping threshold for gradients
    noise_multiplier=None,  # Auto-calculated from epsilon if None

    # Training parameters
    epochs=10,
    batch_size=64,          # Larger batch sizes need less noise
    learning_rate=0.001,    # May need adjustment compared to non-DP training

    # Validation and early stopping
    validation_split=0.2,
    early_stopping_patience=3,

    # Other parameters
    verbose=True,
)

Data Preparation Tips

For better performance with differential privacy:

- Normalize your data: Normalized features perform better with gradient clipping
- Balance classes: Imbalanced datasets can make private training more challenging
- Remove outliers: Extreme values can have a disproportionate effect on gradients

# Example of proper data preparation
import numpy as np
from sklearn.preprocessing import StandardScaler

# Normalize features: fit the scaler on the training split only, then reuse
# the same statistics for the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a balanced subsample if needed
from sklearn.utils import resample

# resample(..., stratify=y_train) with its default replace=True returns a
# same-size bootstrap that keeps the original class proportions, so it does
# not balance anything. Instead, undersample every class down to the size of
# the smallest class. Sampling without replacement avoids duplicated records.
# NOTE: assumes X_train and y_train are numpy arrays of equal length.
class_labels, class_counts = np.unique(y_train, return_counts=True)
n_per_class = int(class_counts.min())
balanced_parts = [
    resample(
        X_train[y_train == label],
        y_train[y_train == label],
        replace=False,
        n_samples=n_per_class,
        random_state=42,
    )
    for label in class_labels
]
X_balanced = np.concatenate([X_part for X_part, _ in balanced_parts])
y_balanced = np.concatenate([y_part for _, y_part in balanced_parts])

# The parts are grouped by class after concatenation; shuffle them together
order = np.random.default_rng(42).permutation(len(y_balanced))
X_balanced, y_balanced = X_balanced[order], y_balanced[order]

Monitoring Privacy Budget

Both PyTorch and TensorFlow implementations allow you to monitor the privacy budget spent:

# For PyTorch (Opacus)
from opacus import PrivacyEngine

# The privacy engine exposes a get_epsilon method that can be queried after training
privacy_engine = PrivacyEngine()
# Training code...

# Query the privacy budget spent for the chosen delta and report it
target_delta = 1e-5
spent_epsilon = privacy_engine.get_epsilon(delta=target_delta)
print(f"Privacy budget spent (ε = {spent_epsilon:.4f})")

# For TensorFlow Privacy, the spent budget is returned as part of the result
# when verbose=True is set in differentially_private_train