04 — libcufile Programming API¶
1. Core API Overview¶
libcufile provides five categories of operations:
1. Initialization
cuFileDriverOpen() — initialize GDS driver connection
cuFileDriverClose() — cleanup
2. File Handles
cuFileHandleRegister() — register a POSIX fd with GDS
cuFileHandleDeregister() — release file handle
3. Buffer Registration
cuFileBufRegister() — register GPU memory for DMA
cuFileBufDeregister() — release GPU buffer registration
4. I/O Operations
cuFileRead() — GPU DMA read from file
cuFileWrite() — GPU DMA write to file
5. Batch I/O (async)
cuFileBatchIOSetUp() — create batch context
cuFileBatchIOSubmit() — submit batch of I/O ops
cuFileBatchIOGetStatus() — poll for completion
cuFileBatchIOCancel() — cancel pending batch
cuFileBatchIODestroy() — cleanup batch context
2. Basic GDS Read: File → GPU Memory¶
#include <cufile.h>
#include <cuda_runtime.h>
#include <fcntl.h>
#include <assert.h>
// Read an entire file straight into GPU memory via GPUDirect Storage.
//   filepath:  file on a GDS-capable filesystem (opened with O_DIRECT)
//   file_size: bytes to read; must follow GDS alignment rules (multiple of 512)
// NOTE: myKernel/grid/block are placeholders supplied by the surrounding tutorial.
void gds_read_example(const char* filepath, size_t file_size) {
    // === 1. Initialize GDS driver ===
    CUfileError_t status = cuFileDriverOpen();
    assert(status.err == CU_FILE_SUCCESS);
    // === 2. Open file ===
    // O_DIRECT is required: GDS DMAs between storage and GPU, bypassing the page cache.
    int fd = open(filepath, O_RDONLY | O_DIRECT);
    assert(fd >= 0);
    // === 3. Register file handle with GDS ===
    CUfileDescr_t cf_descr = {};
    cf_descr.handle.fd = fd;
    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileHandle_t cf_handle;
    status = cuFileHandleRegister(&cf_handle, &cf_descr);
    assert(status.err == CU_FILE_SUCCESS);
    // === 4. Allocate GPU buffer ===
    void* d_buf = NULL;
    cudaError_t cerr = cudaMalloc(&d_buf, file_size);  // FIX: was unchecked
    assert(cerr == cudaSuccess);
    // === 5. Register GPU buffer for DMA ===
    // This pins the GPU physical pages for direct DMA access
    status = cuFileBufRegister(d_buf, file_size, 0);
    assert(status.err == CU_FILE_SUCCESS);
    // === 6. Read: file → GPU memory (DMA, no CPU bounce buffer) ===
    ssize_t bytes_read = cuFileRead(
        cf_handle,   // registered file handle
        d_buf,       // destination: GPU buffer (registered)
        file_size,   // bytes to read
        0,           // file offset
        0            // GPU buffer offset
    );
    assert(bytes_read == (ssize_t)file_size);
    // === 7. GPU can now use d_buf directly ===
    myKernel<<<grid, block>>>(d_buf, file_size / sizeof(float));
    cerr = cudaDeviceSynchronize();                    // FIX: surface async kernel errors
    assert(cerr == cudaSuccess);
    // === 8. Cleanup (deregister before freeing the buffer) ===
    cuFileBufDeregister(d_buf);
    cuFileHandleDeregister(cf_handle);
    close(fd);
    cudaFree(d_buf);
    cuFileDriverClose();
}
3. Basic GDS Write: GPU Memory → File¶
// Write a GPU buffer straight to a file via GPUDirect Storage.
//   filepath: destination file (created if missing; opened with O_DIRECT)
//   d_output: device pointer to the data to write
//   size:     bytes to write; must follow GDS alignment rules (multiple of 512)
// FIX: the original ignored every return value — now checked, matching the
// error-handling style of gds_read_example above.
void gds_write_example(const char* filepath, float* d_output, size_t size) {
    CUfileError_t status = cuFileDriverOpen();
    assert(status.err == CU_FILE_SUCCESS);
    // Open file for writing with O_DIRECT (required for GDS)
    int fd = open(filepath, O_WRONLY | O_CREAT | O_DIRECT, 0644);
    assert(fd >= 0);
    // Register file and GPU buffer (same pattern as read)
    CUfileDescr_t cf_descr = {};
    cf_descr.handle.fd = fd;
    cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
    CUfileHandle_t cf_handle;
    status = cuFileHandleRegister(&cf_handle, &cf_descr);
    assert(status.err == CU_FILE_SUCCESS);
    status = cuFileBufRegister(d_output, size, 0);
    assert(status.err == CU_FILE_SUCCESS);
    // Write GPU memory → file (DMA)
    ssize_t bytes_written = cuFileWrite(
        cf_handle,
        d_output,    // source: GPU buffer
        size,        // bytes to write
        0,           // file offset
        0            // GPU buffer offset
    );
    assert(bytes_written == (ssize_t)size);
    cuFileBufDeregister(d_output);
    cuFileHandleDeregister(cf_handle);
    close(fd);
    cuFileDriverClose();
}
4. Alignment Requirements¶
GDS performs best when all I/O parameters are 512-byte aligned; unaligned transfers may be routed through internal bounce buffers and lose the zero-copy benefit:
// For best performance, ALL of these should be multiples of 512:
//   file_offset    (byte position in file)
//   gpu_buf_offset (byte offset into GPU buffer)
//   transfer_size  (bytes to transfer)
// Round the GPU buffer size up to the alignment as well:
void* d_buf;
size_t ALIGN = 512;
size_t aligned_size = (size + ALIGN - 1) & ~(ALIGN - 1); // round up to multiple of ALIGN (valid because ALIGN is a power of two)
cudaMalloc(&d_buf, aligned_size);
// Driver-API allocation works equally well; note that neither cudaMalloc nor
// cuMemAlloc takes an alignment argument — both return pointers aligned well
// beyond 512 bytes in practice:
CUdeviceptr d_ptr;
cuMemAlloc(&d_ptr, aligned_size);
// Or use posix_memalign for pinned host buffers:
void* h_buf;
posix_memalign(&h_buf, 4096, aligned_size); // 4096-byte alignment satisfies O_DIRECT; NOTE(review): return value unchecked — h_buf is indeterminate on failure
cudaHostRegister(h_buf, aligned_size, cudaHostRegisterDefault);
5. Batch I/O — High-Throughput Async Operations¶
Batch I/O submits multiple transfers simultaneously, maximizing NVMe queue depth:
#define BATCH_SIZE 16 // submit 16 I/O ops at once
#define CHUNK_SIZE (4 * 1024 * 1024) // 4 MB per chunk (512-byte aligned)
// Submit up to BATCH_SIZE reads in one batch to keep NVMe queue depth high.
//   handles[i]/gpu_buffers[i]/sizes[i] describe one pre-registered file+buffer pair
//   n_files: number of valid entries in the three arrays
void gds_batch_read(CUfileHandle_t* handles, void** gpu_buffers,
                    size_t* sizes, int n_files) {
    // FIX: the original always submitted BATCH_SIZE ops, so when
    // n_files < BATCH_SIZE it handed uninitialized io_params entries
    // to the driver. Submit only the entries actually filled in.
    unsigned n_ops = (n_files < BATCH_SIZE) ? (unsigned)n_files : BATCH_SIZE;
    if (n_ops == 0) return;   // nothing to do
    // === Setup batch context ===
    CUfileBatchHandle_t batch;
    cuFileBatchIOSetUp(&batch, BATCH_SIZE);
    // === Submit batch of reads ===
    CUfileIOParams_t io_params[BATCH_SIZE] = {};   // zero-init: no stale fields
    for (unsigned i = 0; i < n_ops; i++) {
        io_params[i].mode = CUFILE_BATCH;
        io_params[i].fh = handles[i];
        io_params[i].u.batch.devPtr_base = gpu_buffers[i];
        io_params[i].u.batch.file_offset = 0;
        io_params[i].u.batch.devPtr_offset = 0;
        io_params[i].u.batch.size = sizes[i];
        io_params[i].opcode = CUFILE_READ;
    }
    cuFileBatchIOSubmit(batch, n_ops, io_params, 0);
    // === Poll until every submitted op has completed ===
    CUfileIOEvents_t io_events[BATCH_SIZE];
    unsigned completed = 0;
    while (completed < n_ops) {
        unsigned nr = n_ops;   // in: capacity of io_events; out: events returned
        CUfileError_t err = cuFileBatchIOGetStatus(batch, /*min_nr=*/0, &nr, io_events, NULL);
        assert(err.err == CU_FILE_SUCCESS);
        for (unsigned j = 0; j < nr; j++) {
            if (io_events[j].status == CUFILE_COMPLETE) {
                completed++;
            }
        }
    }
    cuFileBatchIODestroy(batch);
}
6. GDS with PyTorch DataLoader¶
The most common production use: loading training data directly into GPU tensors.
# gds_dataset.py
import torch
import cufile # pip install cufile (NVIDIA Python bindings)
import numpy as np
import os
class GDSDataset(torch.utils.data.Dataset):
    """Dataset that streams samples from NVMe directly into GPU memory via GDS.

    The CPU never touches the sample bytes: each item is DMA'd from storage
    straight into a freshly allocated GPU tensor.
    """

    def __init__(self, data_dir: str, gpu_id: int = 0):
        # Deterministic ordering: collect *.bin entries, then sort.
        paths = [
            os.path.join(data_dir, name)
            for name in os.listdir(data_dir)
            if name.endswith(".bin")
        ]
        paths.sort()
        self.files = paths
        self.gpu_id = gpu_id
        # seq=4096 * hidden=512 * 2 bytes (BF16) = 4194304 — a multiple of 512,
        # satisfying the GDS alignment requirement.
        self.sample_size = 4096 * 512 * 2

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx: int) -> torch.Tensor:
        path = self.files[idx]
        # Allocate the destination tensor on the target GPU;
        # GDS fills it in place. Shape: [seq_len, hidden_dim], dtype bfloat16.
        sample = torch.empty(
            4096, 512,
            dtype=torch.bfloat16,
            device=f"cuda:{self.gpu_id}",
        )
        # NVMe → GPU DMA read, no CPU bounce buffer.
        with cufile.open(path, "r") as fh:
            fh.read(sample)
        return sample
# Usage with DataLoader
dataset = GDSDataset("/mnt/nvme0/training_data/", gpu_id=0)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=32,
    num_workers=4,                     # 4 worker processes each doing GDS reads
    # FIX: workers allocate CUDA tensors, and CUDA cannot be (re)initialized in
    # fork-started subprocesses — the default on Linux. Spawn is required here.
    multiprocessing_context="spawn",
    pin_memory=False,                  # already in GPU memory, no pin needed
    prefetch_factor=2,                 # prefetch 2 batches ahead
)
for batch in loader:
    # batch is already on GPU — no .to(device) needed!
    loss = model(batch).loss
    loss.backward()
Lower-Level PyTorch GDS Integration¶
import torch
import ctypes
import os
# Load libcufile manually for fine-grained control
_cufile = ctypes.CDLL("libcufile.so")
def gds_load_tensor(filepath: str, shape: tuple, dtype: torch.dtype, device: str) -> torch.Tensor:
    """Load a binary file directly into a GPU tensor using GDS.

    Args:
        filepath: path to the raw binary file (on a GDS-capable filesystem).
        shape:    tensor shape; total byte size must be a multiple of 512.
        dtype:    torch dtype of the stored data.
        device:   CUDA device string, e.g. "cuda:0".

    Returns:
        A tensor on `device` filled via DMA from storage.

    Raises:
        OSError: if the read transfers fewer bytes than requested.
    """
    # Allocate GPU tensor — GDS fills it in place.
    tensor = torch.empty(shape, dtype=dtype, device=device)
    nbytes = tensor.element_size() * tensor.numel()
    # Ensure 512-byte alignment (GDS requirement)
    assert nbytes % 512 == 0, f"Size {nbytes} not 512-byte aligned"
    # Open file with O_DIRECT (required for GDS)
    fd = os.open(filepath, os.O_RDONLY | os.O_DIRECT)
    try:
        # Register file with GDS
        cf_descr = ...  # CUfileDescr_t via ctypes
        cf_handle = ...
        _cufile.cuFileHandleRegister(ctypes.byref(cf_handle), ctypes.byref(cf_descr))
        try:
            # Register GPU buffer
            data_ptr = tensor.data_ptr()
            _cufile.cuFileBufRegister(ctypes.c_void_p(data_ptr), ctypes.c_size_t(nbytes), 0)
            try:
                # FIX: declare the return type — cuFileRead returns ssize_t, and
                # ctypes' default int restype would truncate it on LP64 systems.
                _cufile.cuFileRead.restype = ctypes.c_ssize_t
                # Read: file → GPU
                nread = _cufile.cuFileRead(
                    cf_handle, ctypes.c_void_p(data_ptr), ctypes.c_size_t(nbytes), 0, 0
                )
                # FIX: the original ignored the result entirely.
                if nread != nbytes:
                    raise OSError(f"cuFileRead returned {nread}, expected {nbytes}")
            finally:
                # FIX: deregistration now runs on error paths too (the original
                # only guarded os.close with finally, leaking registrations).
                _cufile.cuFileBufDeregister(ctypes.c_void_p(data_ptr))
        finally:
            _cufile.cuFileHandleDeregister(cf_handle)
    finally:
        os.close(fd)
    return tensor
7. Overlap I/O and Compute with CUDA Streams¶
The key to maximum throughput: load the next batch while processing the current one.
// Double-buffered training: I/O and compute in parallel
// Ping-pong double buffering: issue the read for batch k+1 while batch k is
// consumed on compute_stream.
//   fh:            registered cuFile handle for the input file
//   model_weights: device pointer consumed by trainKernel
//   steps:         number of BATCH_SIZE-byte batches to process
// NOTE: cuFileRead blocks the calling host thread, so true overlap requires a
// second host thread or the cuFileBatchIO* async API (see section 5).
// NOTE: grid/block/trainKernel are placeholders supplied by the tutorial.
void overlap_io_compute(CUfileHandle_t fh, float* model_weights, int steps) {
    cudaStream_t compute_stream, io_stream;
    cudaStreamCreate(&compute_stream);
    cudaStreamCreate(&io_stream);
    // Two GPU buffers: ping-pong
    void* d_buf[2];
    cudaMalloc(&d_buf[0], BATCH_SIZE);
    cudaMalloc(&d_buf[1], BATCH_SIZE);
    cuFileBufRegister(d_buf[0], BATCH_SIZE, 0);
    cuFileBufRegister(d_buf[1], BATCH_SIZE, 0);
    // Pre-load first batch
    cuFileRead(fh, d_buf[0], BATCH_SIZE, 0 * BATCH_SIZE, 0);
    for (int step = 0; step < steps; step++) {
        int cur = step % 2;
        int nxt = (step + 1) % 2;
        // Submit NEXT batch I/O while CURRENT batch computes
        if (step + 1 < steps) {
            // FIX: widen before multiplying — (step+1) * BATCH_SIZE is an int
            // product and overflows past 2 GiB of file offset.
            cuFileRead(fh, d_buf[nxt], BATCH_SIZE, (off_t)(step + 1) * BATCH_SIZE, 0);
        }
        // Process CURRENT batch on compute_stream
        trainKernel<<<grid, block, 0, compute_stream>>>(
            (float*)d_buf[cur], model_weights, BATCH_SIZE / sizeof(float)
        );
        cudaStreamSynchronize(compute_stream);
    }
    cuFileBufDeregister(d_buf[0]);
    cuFileBufDeregister(d_buf[1]);
    cudaFree(d_buf[0]);
    cudaFree(d_buf[1]);
    // FIX: the original leaked both streams.
    cudaStreamDestroy(compute_stream);
    cudaStreamDestroy(io_stream);
}
8. Error Handling¶
#include <cufile.h>
#include <string>
// Translate a CUfileError_t into a human-readable description.
// Unrecognized codes fall through to a generic message carrying the raw value.
std::string cufile_strerror(CUfileError_t err) {
    const auto code = err.err;
    if (code == CU_FILE_SUCCESS)              return "Success";
    if (code == CU_FILE_DRIVER_NOT_OPEN)      return "Driver not open — call cuFileDriverOpen()";
    if (code == CU_FILE_DRIVER_INVALID_PROPS) return "Invalid config in /etc/cufile.json";
    if (code == CU_FILE_INVALID_VALUE)        return "Invalid argument (check alignment: 512-byte)";
    if (code == CU_FILE_CUDA_DRIVER_ERROR)    return "CUDA driver error";
    if (code == CU_FILE_IO_NOT_SUPPORTED)     return "GDS not supported on this file (check fs type)";
    if (code == CU_FILE_INVALID_MAPPING_SIZE) return "Buffer size not aligned to 512 bytes";
    return "Unknown error: " + std::to_string(code);
}
// Usage pattern
CUfileError_t status = cuFileRead(handle, buf, size, 0, 0);
if (status.err != CU_FILE_SUCCESS) {
fprintf(stderr, "GDS read failed: %s\n", cufile_strerror(status).c_str());
// Fall back to posix read if needed
pread(fd, h_staging, size, 0);
cudaMemcpy(d_buf, h_staging, size, cudaMemcpyHostToDevice);
}