Quick Start Guide

This guide will help you get started with Serverless GPU API quickly.

Basic Usage

Here’s a simple example of how to use Serverless GPU API:

from serverless_gpu.launcher import distributed
from serverless_gpu.compute import GPUType
from serverless_gpu import runtime as rt

# Run hello_world remotely across 8 A10 GPUs; only local rank 0 prints
@distributed(gpus=8, gpu_type=GPUType.A10, remote=True)
def hello_world(name: str):
    if rt.get_local_rank() == 0:
        print('hello', name)
    return rt.get_global_rank()

# Launch the function on the remote GPUs; keyword arguments are forwarded to it
hello_world.distributed(name='world')

Here’s another example of training a simple MLP model using PyTorch’s Distributed Data Parallel (DDP) module:

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset
import torch.nn as nn
import torch.optim as optim
from serverless_gpu import distributed
import mlflow

def setup():
    dist.init_process_group("nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

def cleanup():
    dist.destroy_process_group()

class SimpleMLP(nn.Module):
    def __init__(self, input_dim=10, hidden_dim=64, output_dim=1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.net(x)

@distributed(gpus=2, gpu_type='a10', remote=True)
def train(num_epochs=3, batch_size=64) -> None:
    setup()
    rank = dist.get_rank()
    device = torch.device(f"cuda:{int(os.environ['LOCAL_RANK'])}")

    with mlflow.start_run():
        # Model: MLP with hidden layers
        model = SimpleMLP().to(device)
        model = DDP(model, device_ids=[device])

        # Dummy regression dataset
        x = torch.randn(5000, 10)
        y = 3 * x.sum(dim=1, keepdim=True) + torch.randn(5000, 1) * 0.1
        dataset = TensorDataset(x, y)
        sampler = DistributedSampler(dataset)
        dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

        optimizer = optim.Adam(model.parameters(), lr=0.001)
        loss_fn = nn.MSELoss()

        for epoch in range(num_epochs):
            sampler.set_epoch(epoch)
            model.train()
            total_loss = 0.0
            for step, (xb, yb) in enumerate(dataloader):
                xb, yb = xb.to(device), yb.to(device)
                optimizer.zero_grad()
                loss = loss_fn(model(xb), yb)
                # Log the per-step training loss to MLflow with a globally increasing step
                global_step = epoch * len(dataloader) + step
                mlflow.log_metric("mse_loss", loss.item(), step=global_step)

                loss.backward()
                optimizer.step()
                total_loss += loss.item() * xb.size(0)

            if rank == 0:
                # Average the loss over the samples seen by this rank and log it
                avg_loss = total_loss / len(sampler)
                mlflow.log_metric("avg_loss", avg_loss, step=epoch)
                print(f"[Epoch {epoch}] Avg Loss: {avg_loss:.4f}")

                # Save a checkpoint; unwrap the DDP wrapper so keys are not prefixed with "module."
                ckpt = {
                    "MODEL_STATE": model.module.state_dict(),
                    "EPOCHS_RUN": epoch,
                }
                torch.save(ckpt, "/tmp/model.pt")

    cleanup()

train.distributed(num_epochs=3, batch_size=64)
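
Once the run completes, the checkpoint written by rank 0 can be restored for evaluation or further training. The snippet below is a minimal sketch that assumes the /tmp/model.pt file saved above is accessible from wherever you load it:

import torch

# Load the checkpoint saved by rank 0 during training
ckpt = torch.load("/tmp/model.pt", map_location="cpu")

# Rebuild the model and restore its weights
model = SimpleMLP()
model.load_state_dict(ckpt["MODEL_STATE"])
print(f"Restored model from epoch {ckpt['EPOCHS_RUN']}")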

Pool Configuration

By default, SkyRun uses reserved GPU pools. To use on-demand GPUs instead, set the environment variable:

import os

# Use on-demand GPUs instead of the default reserved pool
os.environ['DATABRICKS_USE_RESERVED_GPU_POOL'] = 'false'

To specify the reserved pool to use, set the environment variable:

import os

# Specify the reserved pool ID
os.environ['DATABRICKS_REMOTE_GPU_POOL_ID'] = 'your-pool-id'

These environment variables must be set outside of the @distributed call. Once configured, subsequent @distributed calls will use the specified pool settings, as in the sketch below.
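
For example, here is a minimal sketch that pins a specific reserved pool before launching a job (the pool ID is a placeholder):

import os
from serverless_gpu.launcher import distributed
from serverless_gpu.compute import GPUType

# Configure the pool first, outside of any @distributed call
os.environ['DATABRICKS_REMOTE_GPU_POOL_ID'] = 'your-pool-id'  # placeholder pool ID

@distributed(gpus=2, gpu_type=GPUType.A10, remote=True)
def pool_job():
    return 'done'

# This launch picks up the reserved pool configured above
pool_job.distributed()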

Ray Integration

Serverless GPU API also supports Ray for distributed computing. Here’s an example using the ray_launch decorator:

from serverless_gpu.ray import ray_launch

@ray_launch(gpus=16, remote=True, gpu_type='h100')
def foo():
    import ray

    # Inspect the GPUs and other resources Ray sees on each node of the cluster
    print(ray.state.available_resources_per_node())
    return 1

foo.distributed()

The ray_launch decorator automatically sets up a Ray cluster across all allocated GPUs and provides a Ray environment for your function to use. This is particularly useful for Ray-based distributed machine learning workloads.
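
As a rough sketch of what that environment enables (the function body below is illustrative, not part of the API), Ray tasks launched inside a ray_launch-decorated function can request GPUs from the cluster directly:

from serverless_gpu.ray import ray_launch

@ray_launch(gpus=4, remote=True, gpu_type='a10')
def run_tasks():
    import ray

    # Each task requests one GPU from the cluster that ray_launch set up
    @ray.remote(num_gpus=1)
    def square(i):
        return i * i

    # Fan out one task per GPU and collect the results
    return ray.get([square.remote(i) for i in range(4)])

run_tasks.distributed()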

Next Steps

  • Read the Overview to understand Serverless GPU API’s architecture

  • Check out the API Reference for detailed API documentation

  • Explore other examples in the repository