Quick Start Guide
This guide will help you get started quickly with the Serverless GPU API.
Basic Usage
Here’s a simple example of how to use the Serverless GPU API:
from serverless_gpu.launcher import distributed
from serverless_gpu.compute import GPUType
from serverless_gpu import runtime as rt
@distributed(gpus=8, gpu_type=GPUType.H100)
def hello_world(name: str):
    """Greet ``name`` and report this process's global rank.

    The greeting is printed only when ``rt.get_local_rank()`` is 0; every
    caller returns the value of ``rt.get_global_rank()``.
    """
    is_local_leader = rt.get_local_rank() == 0
    if is_local_leader:
        print('hello', name)

    global_rank = rt.get_global_rank()
    return global_rank
# Invoke the decorated function through its `.distributed` entry point.
# NOTE(review): presumably this runs it on the GPUs requested in the
# decorator - confirm against the API Reference.
hello_world.distributed(name='world')
Here’s another example, which trains a simple MLP model using PyTorch’s DistributedDataParallel (DDP) module:
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset
import torch.nn as nn
import torch.optim as optim
from serverless_gpu import distributed
import mlflow
def setup():
    """Initialise the NCCL process group and select this process's GPU.

    The CUDA device index is read from the ``LOCAL_RANK`` environment
    variable after the process group has been initialised.
    """
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
def cleanup():
    """Destroy the default process group if one is currently initialised.

    Safe to call more than once, and safe to call when initialisation never
    happened (for example from an error-handling path), because the teardown
    is skipped unless a process group actually exists.
    """
    # destroy_process_group() raises when no group has been initialised, so
    # check first to keep teardown idempotent.
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
class SimpleMLP(nn.Module):
    """Three-layer perceptron: two hidden ReLU layers with dropout, linear output.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Size of each output vector.
        dropout: Probability passed to both ``nn.Dropout`` layers
            (defaults to the previously hard-coded 0.2).
    """

    def __init__(self, input_dim=10, hidden_dim=64, output_dim=1, dropout=0.2):
        super().__init__()
        # Layer order is kept fixed so state_dict keys (net.0, net.3, net.6)
        # stay compatible with existing checkpoints.
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return the result."""
        return self.net(x)
@distributed(gpus=8, gpu_type='H100')
def train(num_epochs=3, batch_size=64) -> None:
    """Train ``SimpleMLP`` on a synthetic regression task with DDP.

    Each process initialises the process group via ``setup()``, wraps the
    model in ``DistributedDataParallel``, and iterates over its
    ``DistributedSampler`` shard of a randomly generated dataset. Per-step
    and per-epoch losses are logged to MLflow; rank 0 prints the epoch
    average and writes a checkpoint to ``/tmp/model.pt`` after the last epoch.

    Args:
        num_epochs: Number of passes over the dataset.
        batch_size: Per-process batch size given to the ``DataLoader``.
    """
    setup()
    try:
        rank = dist.get_rank()
        device = torch.device(f"cuda:{int(os.environ['LOCAL_RANK'])}")

        with mlflow.start_run():
            # Model: MLP with hidden layers
            model = SimpleMLP().to(device)
            model = DDP(model, device_ids=[device])

            # Dummy regression dataset
            x = torch.randn(5000, 10)
            y = 3 * x.sum(dim=1, keepdim=True) + torch.randn(5000, 1) * 0.1
            dataset = TensorDataset(x, y)
            sampler = DistributedSampler(dataset)
            dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

            optimizer = optim.Adam(model.parameters(), lr=0.001)
            loss_fn = nn.MSELoss()
            steps_per_epoch = len(dataloader)

            for epoch in range(num_epochs):
                # Re-seed the sampler so each epoch uses a different shuffle.
                sampler.set_epoch(epoch)
                model.train()
                total_loss = 0.0
                samples_seen = 0

                for step, (xb, yb) in enumerate(dataloader):
                    xb, yb = xb.to(device), yb.to(device)
                    optimizer.zero_grad()
                    loss = loss_fn(model(xb), yb)
                    loss.backward()
                    optimizer.step()

                    batch_loss = loss.item()
                    # Log per-step loss metrics to MLflow. The step index
                    # keeps increasing across epochs so later epochs do not
                    # overwrite earlier points.
                    mlflow.log_metric(
                        "mse_loss",
                        batch_loss,
                        step=epoch * steps_per_epoch + step,
                    )
                    total_loss += batch_loss * xb.size(0)
                    samples_seen += xb.size(0)

                if rank == 0:
                    # Average over the samples this rank actually processed;
                    # dividing by the full dataset size would under-report
                    # the loss, because each rank only sees its own shard.
                    avg_loss = total_loss / max(samples_seen, 1)
                    mlflow.log_metric("avg_loss", avg_loss, step=epoch)
                    print(f"[Epoch {epoch}] Avg Loss: {avg_loss:.4f}")

            if rank == 0:
                # Save checkpoint. num_epochs is the number of completed
                # epochs and is defined even when the loop body never ran.
                ckpt = {
                    "MODEL_STATE": model.state_dict(),
                    "EPOCHS_RUN": num_epochs,
                }
                torch.save(ckpt, "/tmp/model.pt")
    finally:
        # Tear down the process group even if training raised.
        cleanup()
# Invoke the decorated training function through its `.distributed` entry
# point, passing the same values as the function's defaults.
# NOTE(review): presumably this runs it on the GPUs requested in the
# decorator - confirm against the API Reference.
train.distributed(num_epochs=3, batch_size=64)
Next Steps
Read the Overview to understand the architecture of the Serverless GPU API
Check out the API Reference for detailed API documentation
Explore other examples in the repository