Source code for cuperiod.core.device

"""GPU device introspection, memory cleanup, and worker-count sizing.

Batch GPU runs are limited by device memory, not cores: each worker process holds its
own plans/kernels and transient per-light-curve buffers. :func:`suggest_gpu_workers`
turns a probed free-memory figure plus a per-worker footprint estimate into a safe
worker count, and :func:`free_gpu_memory` bounds cupy memory-pool growth across a
long-lived batch process.
"""

from __future__ import annotations

import os
from dataclasses import dataclass

from cuperiod.core.backend import cuda_available, ensure_cuda_dll_path, has_module

#: Conservative upper bound on the worker count returned by
#: :func:`suggest_gpu_workers`. Without NVIDIA MPS, work submitted by separate
#: processes is time-sliced on the device rather than run concurrently, so many
#: workers rarely help; this caps a runaway suggestion.
_SANE_MAX_GPU_WORKERS = 16



[docs]
@dataclass(frozen=True)
class GpuInfo:
    """Immutable record describing a single CUDA device at probe time.

    Attributes
    ----------
    device_id : int
        CUDA device ordinal.
    name : str
        Device name.
    total_bytes : int
        Total device memory, in bytes.
    free_bytes : int
        Free device memory, in bytes.
    mps_enabled : bool
        MPS flag recorded for the device; rendered as ``on``/``off`` by ``str()``.
    """

    device_id: int
    name: str
    total_bytes: int
    free_bytes: int
    mps_enabled: bool

    @property
    def total_gib(self) -> float:
        """Device memory capacity expressed in GiB (bytes / 2**30)."""
        return self.total_bytes / 2**30

    @property
    def free_gib(self) -> float:
        """Currently free device memory expressed in GiB (bytes / 2**30)."""
        return self.free_bytes / 2**30

    def __str__(self) -> str:
        """Render a one-line summary: ordinal, name, free/total GiB, MPS state."""
        memory = f"{self.free_gib:.1f}/{self.total_gib:.1f} GiB free"
        mps_state = "on" if self.mps_enabled else "off"
        return f"GPU {self.device_id}: {self.name} | {memory} | MPS {mps_state}"




[docs]
def gpu_info(device_id: int = 0) -> GpuInfo | None:
    """Probe one CUDA device and report its name and memory as a :class:`GpuInfo`.

    Parameters
    ----------
    device_id : int, default 0
        Ordinal of the CUDA device to query.

    Returns
    -------
    GpuInfo or None
        The snapshot, or ``None`` when ``cuda_available()`` is false or when any
        step of the cupy probe raises.
    """
    if not cuda_available():
        return None

    ensure_cuda_dll_path()
    try:
        import cupy

        # Query memory and properties while ``device_id`` is the current device.
        with cupy.cuda.Device(device_id):
            free_mem, total_mem = cupy.cuda.runtime.memGetInfo()
            properties = cupy.cuda.runtime.getDeviceProperties(device_id)

        # The device name may come back as bytes or as something str()-able.
        device_name = properties["name"]
        if isinstance(device_name, bytes):
            device_name = device_name.decode()
        else:
            device_name = str(device_name)

        snapshot = GpuInfo(
            device_id=device_id,
            name=device_name,
            total_bytes=int(total_mem),
            free_bytes=int(free_mem),
            # MPS is inferred purely from the presence of this environment variable.
            mps_enabled="CUDA_MPS_PIPE_DIRECTORY" in os.environ,
        )
    except Exception:
        # Best-effort probe: any failure is reported as "no usable GPU".
        return None
    return snapshot




[docs]
def free_gpu_memory() -> None:
    """Release the blocks cached by cupy's default device and pinned memory pools.

    Does nothing when cupy is not installed. cupy's default pool keeps freed blocks
    around for reuse, so a long-running batch process that allocates many
    differently-sized arrays can see the pool grow over time; calling this between
    batch shards hands the cached blocks back. Any error raised while doing so is
    swallowed — this is a best-effort cleanup.
    """
    if has_module("cupy"):
        try:
            import cupy

            # Device pool first, then the pinned (host) pool.
            device_pool = cupy.get_default_memory_pool()
            device_pool.free_all_blocks()
            pinned_pool = cupy.get_default_pinned_memory_pool()
            pinned_pool.free_all_blocks()
        except Exception:
            pass



def _estimate_worker_bytes(method: str, n_points: int) -> int:
    """Per-worker device footprint estimate (bytes) for ``method`` at ``n_points``.

    Defers to the method's own estimate when available, else uses a generic figure:
    a handful of complex128 buffers over the grid plus a fixed plan/context base.
    """
    try:
        from cuperiod.methods.base import get_method

        return int(get_method(method).estimate_device_bytes(n_points))
    except Exception:
        return 64 * 1024**2 + n_points * 16 * 6



[docs]
def suggest_gpu_workers(
    method: str = "GLS",
    *,
    n_points_hint: int = 2000,
    grid_size: int | None = None,
    headroom: float = 0.20,
    device_id: int = 0,
) -> int:
    """Recommend how many GPU workers fit in the device's currently free memory.

    The free memory reported for ``device_id`` is reduced by ``headroom`` and then
    divided by the estimated per-worker footprint for ``method``.

    Parameters
    ----------
    method : str, default "GLS"
        Method name, used to look up a per-worker footprint estimate.
    n_points_hint : int, default 2000
        Grid size to assume when ``grid_size`` is ``None``.
    grid_size : int, optional
        Actual grid size, if known; takes precedence over ``n_points_hint``.
    headroom : float, default 0.20
        Fraction of free memory to leave unused (driver/fragmentation slack).
    device_id : int, default 0
        CUDA device ordinal.

    Returns
    -------
    int
        A worker count between 1 and the module's sane maximum, inclusive.
        Returns 1 when no GPU is available.
    """
    device = gpu_info(device_id)
    if device is None:
        # No usable GPU: a single worker is the only meaningful answer.
        return 1

    points = n_points_hint if grid_size is None else grid_size
    footprint = _estimate_worker_bytes(method, int(points))
    if footprint < 1:
        # Guard the division below against a zero or negative estimate.
        footprint = 1

    keep_fraction = 1.0 - headroom
    budget = device.free_bytes * keep_fraction
    fit = int(budget // footprint)

    # Clamp into [1, _SANE_MAX_GPU_WORKERS].
    return min(max(fit, 1), _SANE_MAX_GPU_WORKERS)



# Public API of this module; names with a leading underscore stay private.
__all__ = [
    "GpuInfo",
    "free_gpu_memory",
    "gpu_info",
    "suggest_gpu_workers",
]