Full Example Specs

This appendix embeds the runnable repository example YAML files directly from examples/.

Some repository examples set an explicit cache_dir of ${CACHE_DIR:-/cluster/shared/hpc-compose-cache} for portability, while the starter examples omit cache_dir and rely on the default cache location (from settings, or the built-in default). Before running on a real cluster, point CACHE_DIR at a shared path that is visible from both the submission host and the compute nodes:

# Export the variable that the examples' cache_dir setting reads.
export CACHE_DIR=/cluster/shared/hpc-compose-cache
# Create the directory (and any missing parents) if it does not exist yet.
mkdir -p "$CACHE_DIR"
# Returns non-zero when the directory is not writable by the current user.
test -w "$CACHE_DIR"

App Redis Worker

Source: examples/app-redis-worker.yaml

# Redis server plus a redis-cli worker that starts only after Redis is healthy.
name: redis-demo

x-slurm:
  job_name: redis-demo
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  time: '00:15:00'
  cpus_per_task: 2
  mem: 8G

services:
  redis:
    image: redis:7
    # An empty --save value and --appendonly no disable on-disk persistence.
    command: 'redis-server --save "" --appendonly no'
    x-slurm:
      cpus_per_task: 1
    # TCP readiness probe against 127.0.0.1:6379 with a 30-second timeout.
    readiness:
      type: tcp
      port: 6379
      host: 127.0.0.1
      timeout_seconds: 30

  worker:
    image: redis:7
    depends_on:
      redis:
        condition: service_healthy
    # Ping once, then increment the `jobs` counter every two seconds, forever.
    command:
      - /bin/sh
      - -lc
      - |
        redis-cli -h 127.0.0.1 ping
        while true; do
          redis-cli -h 127.0.0.1 incr jobs
          sleep 2
        done
    x-slurm:
      cpus_per_task: 1

Canary Right Size

Source: examples/canary-right-size.yaml

# Small workload submitted under a large request (4 GPUs, 64G, 4 hours) with
# metrics sampling enabled.
# NOTE(review): presumably a short "canary" run whose metrics guide
# right-sizing of the request — confirm against the tool's docs.
name: canary-right-size

x-slurm:
  job_name: canary-right-size
  partition: gpu
  time: "04:00:00"
  mem: 64G
  gpus: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  # Metrics collection is switched on with a 10-second interval.
  metrics:
    enabled: true
    interval_seconds: 10

services:
  trainer:
    image: python:3.12-slim
    # Allocates a 512 MiB bytearray, prints its size, then sleeps 20 seconds.
    command:
      - /bin/sh
      - -lc
      - |
        python - <<'PY'
        import time
        data = bytearray(512 * 1024 * 1024)
        print(f"allocated {len(data)} bytes")
        time.sleep(20)
        PY
    x-slurm:
      cpus_per_task: 8

Dev Python App

Source: examples/dev-python-app.yaml

# Starter spec: runs `python -m main` from a bind-mounted ./app directory.
# No cache_dir is set, so the default cache location applies.
name: dev-python-app

x-slurm:
  job_name: dev-python-app
  cpus_per_task: 2
  mem: 8G
  time: '00:30:00'

services:
  app:
    image: python:3.11-slim
    # Python dependencies are installed by a prepare command, not in `command`.
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir fastapi uvicorn openai
    # ./app on the host is mounted at /workspace, which is also the working dir.
    volumes:
      - ./app:/workspace
    working_dir: /workspace
    command: ["python", "-m", "main"]

Dev Python Smoke

Source: examples/dev-python-smoke.yaml

# One-minute smoke test: import `main` from the mounted ./app directory and
# print a marker line.
name: dev-python-smoke

x-slurm:
  job_name: dev-python-smoke
  cpus_per_task: 1
  mem: 2G
  time: '00:01:00'

services:
  app:
    image: python:3.11-slim
    # Same dependency set as the dev-python-app example.
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir fastapi uvicorn openai
    volumes:
      - ./app:/workspace
    working_dir: /workspace
    # flush=True makes the marker appear in the log immediately.
    command:
      - python
      - -c
      - "import main; print('smoke ok', flush=True)"

Fairseq Preprocess

Source: examples/fairseq-preprocess.yaml

# CPU-only text preprocessing: converts every *.txt file under INPUT_DIR into a
# JSONL file under OUTPUT_DIR and writes a manifest.json summary.
name: fairseq-preprocess

x-slurm:
  job_name: fairseq-preprocess
  time: "02:00:00"
  mem: 32G
  cpus_per_task: 8
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  preprocess:
    image: python:3.11-slim
    # Raw input and processed output live on shared storage under /shared/$USER.
    volumes:
      - /shared/$USER/data/raw:/data/raw
      - /shared/$USER/data/processed:/data/processed
    # NUM_WORKERS is quoted so it stays a string; it matches cpus_per_task below.
    environment:
      INPUT_DIR: /data/raw
      OUTPUT_DIR: /data/processed
      NUM_WORKERS: "8"
    # The Python program is passed inside shell double quotes, so it must only
    # use single-quoted Python strings. Only the modules the script actually
    # uses are imported, and the empty-input path exits via sys.exit(1) rather
    # than the site-provided exit() helper.
    command:
      - /bin/sh
      - -lc
      - |
        python -c "
        import os, json, sys
        from pathlib import Path
        from concurrent.futures import ProcessPoolExecutor

        input_dir = Path(os.environ['INPUT_DIR'])
        output_dir = Path(os.environ['OUTPUT_DIR'])
        num_workers = int(os.environ['NUM_WORKERS'])
        output_dir.mkdir(parents=True, exist_ok=True)

        files = sorted(input_dir.glob('*.txt'))
        if not files:
            print(f'No .txt files found in {input_dir}')
            sys.exit(1)
        print(f'Found {len(files)} input files')

        def process_file(path):
            text = path.read_text(encoding='utf-8', errors='replace')
            lines = [l.strip() for l in text.splitlines() if l.strip()]
            tokens = []
            for line in lines:
                tokens.extend(line.lower().split())
            out = output_dir / f'{path.stem}.jsonl'
            with open(out, 'w') as f:
                for i, line in enumerate(lines):
                    record = {
                        'id': f'{path.stem}_{i}',
                        'text': line,
                        'tokens': len(line.split()),
                    }
                    f.write(json.dumps(record) + '\n')
            return path.name, len(lines), len(tokens)

        with ProcessPoolExecutor(max_workers=num_workers) as pool:
            results = list(pool.map(process_file, files))

        total_lines = sum(r[1] for r in results)
        total_tokens = sum(r[2] for r in results)
        for name, lines, tokens in results:
            print(f'  {name}: {lines} lines, {tokens} tokens')
        print(f'Total: {total_lines} lines, {total_tokens} tokens across {len(files)} files')

        manifest = {
            'files': len(files),
            'total_lines': total_lines,
            'total_tokens': total_tokens,
        }
        (output_dir / 'manifest.json').write_text(json.dumps(manifest, indent=2))
        print('Preprocessing complete')
        "
    x-slurm:
      cpus_per_task: 8

HF Stage Model

Source: examples/hf-stage-model.yaml

name: hf-stage-model

# Stage a pinned HuggingFace model into the job, then serve it.
#
# The download runs INSIDE the Slurm allocation (the compute node has network),
# never on your laptop or over SSH. hpc-compose renders a guarded
# `huggingface-cli download ... --revision <sha> --local-dir <cas-path>` step
# into the batch script and reuses the content-addressed copy on repeat runs.
#
# The revision MUST be an immutable pin (a commit SHA or an explicit immutable
# tag); floating refs like `main` are rejected at validation time so the job is
# reproducible. Set HF_TOKEN in the JOB environment for gated repos — it is
# imported at runtime by huggingface-cli and never written into the script.
x-slurm:
  job_name: hf-stage-model
  time: "02:00:00"
  gpus_per_node: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  stage_in:
    # The staged model lands at the path the server's --model flag reads below.
    - to: /models/llama-3.1-8b
      hf:
        repo: meta-llama/Llama-3.1-8B
        # Quoted so the SHA is always a string: an all-digit or exponent-like
        # hex prefix (for example 1e5...) would otherwise be read as a number.
        revision: "0e9e39f249a16976918f6564b8830bc894c89659"
        kind: model

services:
  server:
    image: vllm/vllm-openai:v0.6.3
    # Serves the staged model over the OpenAI-compatible API on port 8000.
    command:
      - /bin/sh
      - -lc
      - |
        python -m vllm.entrypoints.openai.api_server \
          --model /models/llama-3.1-8b \
          --host 0.0.0.0 \
          --port 8000
    # Readiness is a fixed 5-second sleep, not a probe of port 8000.
    readiness:
      type: sleep
      seconds: 5
    x-slurm:
      gpus_per_node: 1

Jupyter

Source: examples/jupyter.yaml

# JupyterLab on a single-GPU allocation, serving ./project as /workspace.
name: jupyter

x-slurm:
  job_name: jupyter
  time: "08:00:00"
  mem: 16G
  cpus_per_task: 4
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  notebook:
    # NOTE(review): `latest` is a floating tag; consider pinning a dated tag so
    # repeat runs use the same image.
    image: jupyter/scipy-notebook:latest
    working_dir: /workspace
    volumes:
      - ./project:/workspace
    command:
      - jupyter
      - lab
      - --no-browser
      - --ip=0.0.0.0
      - --port
      - "8888"
      - --ServerApp.token
      # Falls back to the placeholder `change-me` when JUPYTER_TOKEN is unset.
      # The server listens on 0.0.0.0, so set a real secret before use.
      - ${JUPYTER_TOKEN:-change-me}
      - --ServerApp.allow_remote_access
      - "True"
    # Ready once the log contains the Lab URL with its token query string.
    # NOTE(review): the escaped `\?` suggests the pattern is a regex — confirm.
    readiness:
      type: log
      pattern: '/lab\?token='

Llama App

Source: examples/llama-app.yaml

# llama.cpp server on one GPU plus a Python app that calls it over loopback.
name: llama-stack

x-slurm:
  job_name: llama-stack
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  time: '02:00:00'
  gpus: 1
  cpus_per_task: 8
  mem: 32G

services:
  llama:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    x-slurm:
      cpus_per_task: 4
      gpus: 1
    # The model is expected at ./models/model.gguf on the host.
    volumes:
      - ./models:/models
    # `exec` replaces the wrapper shell with the server process.
    command:
      - /bin/sh
      - -lc
      - 'exec /app/llama-server -m /models/model.gguf --host 0.0.0.0 --port 8080'
    # TCP readiness probe against 127.0.0.1:8080 with a 60-second timeout.
    readiness:
      type: tcp
      port: 8080
      host: 127.0.0.1
      timeout_seconds: 60

  app:
    image: python:3.11-slim
    x-slurm:
      cpus_per_task: 2
    # Starts only after the llama service reports healthy.
    depends_on:
      llama:
        condition: service_healthy
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir openai fastapi uvicorn
    # Points at the llama service's port on loopback.
    environment:
      LLM_BASE_URL: http://127.0.0.1:8080/v1
    volumes:
      - ./app:/workspace
    working_dir: /workspace
    command: ["python", "-m", "main"]

Llama UV Worker

Source: examples/llama-uv-worker.yaml

# llama.cpp server plus a uv-run Python worker; the server shuts itself down
# once the marker file /hpc-compose/job/request.done appears.
name: llama-uv-worker

x-slurm:
  job_name: llama-uv-worker
  time: "01:00:00"
  mem: 32G
  cpus_per_task: 8
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  llama:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    environment:
      GGUF_MODEL_PATH: /models/model.gguf
    volumes:
      - ./models:/models
    # Script outline: clear a stale marker, start llama-server in the
    # background, then poll once per second. If the server dies first, exit
    # with its status; once the marker exists, stop the server.
    # NOTE(review): `$$` presumably escapes a literal `$` so the container shell
    # expands the variable at runtime — confirm against the interpolation rules.
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        rm -f /hpc-compose/job/request.done
        /app/llama-server -m "$$GGUF_MODEL_PATH" --host 0.0.0.0 --port 8080 &
        server_pid=$$!
        while [ ! -f /hpc-compose/job/request.done ]; do
          if ! kill -0 "$$server_pid" 2>/dev/null; then
            wait "$$server_pid"
            exit $$?
          fi
          sleep 1
        done
        kill "$$server_pid" 2>/dev/null || true
        wait "$$server_pid" || true
    # Ready when the log shows "main: model loaded", within 300 seconds.
    readiness:
      type: log
      pattern: "main: model loaded"
      timeout_seconds: 300
    x-slurm:
      gpus: 1
      cpus_per_task: 4

  worker:
    image: python:3.11-slim
    working_dir: /workspace
    volumes:
      - ./llama-uv-worker:/workspace
    depends_on:
      llama:
        condition: service_healthy
    # REQUEST_DONE_PATH is the same marker file the llama service polls.
    # NOTE(review): worker.py presumably creates it when finished — the script
    # is not visible here, so verify.
    environment:
      OPENAI_BASE_URL: http://127.0.0.1:8080/v1
      MODEL_NAME: local-model
      REQUEST_DONE_PATH: /hpc-compose/job/request.done
    # The uv cache is placed under /hpc-compose/job via UV_CACHE_DIR.
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        UV_CACHE_DIR=/hpc-compose/job/.uv-cache uv run worker.py
    # uv itself is installed by a prepare command.
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir uv
    x-slurm:
      cpus_per_task: 2

LLM Curl Workflow

Source: examples/llm-curl-workflow.yaml

# llama.cpp server plus a one-shot curl client. The two services coordinate
# through the marker file /hpc-compose/job/request.done: the client creates it
# when it exits, and the server stops once it appears.
name: llm-curl-workflow

x-slurm:
  job_name: llm-curl-workflow
  time: "00:30:00"
  mem: 32G
  cpus_per_task: 8
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  llm:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    volumes:
      - ./models:/models
    # Script outline: clear a stale marker, start llama-server in the
    # background, then poll once per second. If the server dies first, exit
    # with its status; once the marker exists, stop the server.
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        rm -f /hpc-compose/job/request.done
        /app/llama-server -m /models/model.gguf --host 0.0.0.0 --port 8080 &
        server_pid=$$!
        while [ ! -f /hpc-compose/job/request.done ]; do
          if ! kill -0 "$$server_pid" 2>/dev/null; then
            wait "$$server_pid"
            exit $$?
          fi
          sleep 1
        done
        kill "$$server_pid" 2>/dev/null || true
        wait "$$server_pid" || true
    # Ready when the log shows "main: model loaded", within 300 seconds.
    readiness:
      type: log
      pattern: "main: model loaded"
      timeout_seconds: 300
    x-slurm:
      gpus: 1
      cpus_per_task: 4

  curl_client:
    image: debian:bookworm-slim
    depends_on:
      llm:
        condition: service_healthy
    environment:
      LLM_BASE_URL: http://127.0.0.1:8080
    # The EXIT trap writes the marker on every exit path, including a `set -e`
    # abort when curl fails. Without it, a failed request would leave the llm
    # service polling for the marker while holding its GPU.
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        trap 'touch /hpc-compose/job/request.done' EXIT
        cat >/tmp/request.json <<'JSON'
        {
          "model": "local-model",
          "messages": [
            {
              "role": "system",
              "content": "You are a concise assistant."
            },
            {
              "role": "user",
              "content": "Explain what readiness checks do in one sentence."
            }
          ],
          "temperature": 0.2,
          "max_tokens": 64
        }
        JSON
        echo "Sending test request to $$LLM_BASE_URL/v1/chat/completions"
        curl --fail --show-error --silent \
          -H 'Content-Type: application/json' \
          --data @/tmp/request.json \
          "$$LLM_BASE_URL/v1/chat/completions"
    # curl is not part of the slim Debian image, so it is installed up front.
    x-runtime:
      prepare:
        commands:
          - apt-get update
          - apt-get install -y --no-install-recommends bash ca-certificates curl
          - rm -rf /var/lib/apt/lists/*
    x-slurm:
      cpus_per_task: 1

LLM Curl Workflow Workdir

Source: examples/llm-curl-workflow-workdir.yaml

# Variant of the llm-curl-workflow example that reads the model from
# $HOME/models and takes the file name from MODEL_FILE. The services coordinate
# through the marker file /hpc-compose/job/request.done.
name: llm-curl-workflow

x-slurm:
  job_name: llm-curl-workflow
  time: "00:30:00"
  mem: 32G
  cpus_per_task: 8
  gpus: 1
  # Uncomment if your cluster requires them.
  # partition: gpu
  # account: my-project
  # Set CACHE_DIR to a path visible from the submission host and compute nodes.
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  llm:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    environment:
      MODEL_FILE: model.gguf
    volumes:
      - $HOME/models:/models
    # Script outline: clear a stale marker, start llama-server in the
    # background, then poll once per second. If the server dies first, exit
    # with its status; once the marker exists, stop the server.
    # The model path is quoted so a MODEL_FILE containing spaces or glob
    # characters is passed to llama-server as a single argument.
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        rm -f /hpc-compose/job/request.done
        /app/llama-server -m "/models/$$MODEL_FILE" --host 0.0.0.0 --port 8080 &
        server_pid=$$!
        while [ ! -f /hpc-compose/job/request.done ]; do
          if ! kill -0 "$$server_pid" 2>/dev/null; then
            wait "$$server_pid"
            exit $$?
          fi
          sleep 1
        done
        kill "$$server_pid" 2>/dev/null || true
        wait "$$server_pid" || true
    # Ready when the log shows "main: model loaded", within 300 seconds.
    readiness:
      type: log
      pattern: "main: model loaded"
      timeout_seconds: 300
    x-slurm:
      gpus: 1
      cpus_per_task: 4

  curl_client:
    image: debian:bookworm-slim
    depends_on:
      llm:
        condition: service_healthy
    environment:
      LLM_BASE_URL: http://127.0.0.1:8080
    # The EXIT trap writes the marker on every exit path, including a `set -e`
    # abort when curl fails. Without it, a failed request would leave the llm
    # service polling for the marker while holding its GPU.
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        trap 'touch /hpc-compose/job/request.done' EXIT
        cat >/tmp/request.json <<'JSON'
        {
          "model": "local-model",
          "messages": [
            {
              "role": "system",
              "content": "You are a concise assistant."
            },
            {
              "role": "user",
              "content": "Explain what readiness checks do in one sentence."
            }
          ],
          "temperature": 0.2,
          "max_tokens": 64
        }
        JSON
        echo "Sending test request to $$LLM_BASE_URL/v1/chat/completions"
        curl --fail --show-error --silent \
          -H 'Content-Type: application/json' \
          --data @/tmp/request.json \
          "$$LLM_BASE_URL/v1/chat/completions"
    # curl is not part of the slim Debian image, so it is installed up front.
    x-runtime:
      prepare:
        commands:
          - apt-get update
          - apt-get install -y --no-install-recommends bash ca-certificates curl
          - rm -rf /var/lib/apt/lists/*
    x-slurm:
      cpus_per_task: 1

Minimal Batch

Source: examples/minimal-batch.yaml

# Smallest starter spec: one service that prints a greeting and exits.
# No cache_dir is set, so the default cache location applies.
name: minimal-batch

x-slurm:
  job_name: minimal-batch
  cpus_per_task: 2
  mem: 4G
  time: '00:10:00'

services:
  app:
    # String-form command (the other examples mostly use the list form).
    command: python -c "print('Hello from Slurm!')"
    image: python:3.11-slim

MPI Hello

Source: examples/mpi-hello.yaml

# Builds a tiny MPI "hello" binary during image preparation and runs it as a
# 4-task PMIx/Open MPI step.
name: mpi-hello

x-slurm:
  job_name: mpi-hello
  time: "00:15:00"
  mem: 8G
  cpus_per_task: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  mpi:
    image: debian:bookworm-slim
    # Runs the binary produced by the prepare commands below.
    command:
      - /bin/sh
      - -lc
      - /usr/local/bin/mpi_hello
    x-runtime:
      prepare:
        commands:
          - apt-get update
          - apt-get install -y --no-install-recommends libopenmpi-dev openmpi-bin gcc
          # Write the C source through a quoted heredoc (no shell expansion),
          # then compile it with mpicc into /usr/local/bin.
          - |
            cat > /tmp/hello.c << 'EOF'
            #include <mpi.h>
            #include <stdio.h>
            int main(int argc, char **argv) {
                MPI_Init(&argc, &argv);
                int rank, size;
                MPI_Comm_rank(MPI_COMM_WORLD, &rank);
                MPI_Comm_size(MPI_COMM_WORLD, &size);
                printf("Hello from rank %d of %d\n", rank, size);
                MPI_Finalize();
                return 0;
            }
            EOF
            mpicc /tmp/hello.c -o /usr/local/bin/mpi_hello
          # Drop the apt package lists and the temporary source file.
          - rm -rf /var/lib/apt/lists/* /tmp/hello.c
    # NOTE(review): this step asks for ntasks: 4 with cpus_per_task: 4, while
    # the top-level x-slurm only sets cpus_per_task: 4 — confirm the allocation
    # is sized to cover the step.
    x-slurm:
      ntasks: 4
      cpus_per_task: 4
      mpi:
        type: pmix
        profile: openmpi
        implementation: openmpi

MPI PMIx v4 Host MPI

Source: examples/mpi-pmix-v4-host-mpi.yaml

# Two-node PMIx v4 job on the pyxis backend that bind-mounts a host Open MPI
# installation into the container.
name: mpi-pmix-v4-host-mpi

runtime:
  backend: pyxis

x-slurm:
  job_name: mpi-pmix-v4-host-mpi
  time: "00:20:00"
  nodes: 2
  ntasks_per_node: 2
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  mpi:
    image: debian:bookworm-slim
    # Prints the MPI type and hostfile, then the host mpirun version.
    # `|| true` keeps a missing or failing mpirun from failing the step.
    command:
      - /bin/sh
      - -lc
      - |
        echo "mpi_type=$$HPC_COMPOSE_MPI_TYPE"
        echo "hostfile=$$HPC_COMPOSE_MPI_HOSTFILE"
        cat "$$HPC_COMPOSE_MPI_HOSTFILE"
        /opt/site/openmpi/bin/mpirun --version || true
    x-slurm:
      nodes: 2
      ntasks_per_node: 2
      mpi:
        type: pmix_v4
        profile: openmpi
        implementation: openmpi
        launcher: srun
        # 2 nodes x 2 tasks per node.
        expected_ranks: 4
        # The host's /opt/site/openmpi is mounted read-only at the same path,
        # and MPI_HOME points at it.
        host_mpi:
          bind_paths:
            - /opt/site/openmpi:/opt/site/openmpi:ro
          env:
            MPI_HOME: /opt/site/openmpi

Multi Node MPI

Source: examples/multi-node-mpi.yaml

# Two-node allocation with a single-node bootstrap service and a 4-rank MPI
# placeholder step that prints the allocation metadata it was given.
name: multi-node-mpi

x-slurm:
  job_name: multi-node-mpi
  time: "00:20:00"
  nodes: 2
  ntasks_per_node: 2
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  bootstrap:
    image: alpine:3.20
    # Prints the primary node name from the job's allocation directory, then
    # stays alive for 30 seconds.
    command:
      - /bin/sh
      - -lc
      - |
        echo "primary=$(cat /hpc-compose/job/allocation/primary_node)"
        sleep 30
    readiness:
      type: sleep
      seconds: 1
    x-slurm:
      nodes: 1

  mpi:
    image: python:3.11-slim
    depends_on:
      bootstrap:
        condition: service_healthy
    # Prints the primary node, node list and MPI hostfile, then reads
    # HPC_COMPOSE_NODE_COUNT and HPC_COMPOSE_MPI_TYPE from Python. No MPI
    # program is actually run here.
    command:
      - /bin/sh
      - -lc
      - |
        echo "primary=$(cat /hpc-compose/job/allocation/primary_node)"
        echo "nodes=$(tr '\n' ' ' < /hpc-compose/job/allocation/nodes.txt)"
        echo "mpi_hostfile=$$HPC_COMPOSE_MPI_HOSTFILE"
        cat "$$HPC_COMPOSE_MPI_HOSTFILE"
        python - <<'PY'
        import os
        print("mpi placeholder")
        print("node_count", os.environ["HPC_COMPOSE_NODE_COUNT"])
        print("mpi_type", os.environ["HPC_COMPOSE_MPI_TYPE"])
        PY
    readiness:
      type: sleep
      seconds: 2
    x-slurm:
      nodes: 2
      ntasks_per_node: 2
      mpi:
        type: pmix
        profile: openmpi
        implementation: openmpi
        launcher: srun
        # 2 nodes x 2 tasks per node.
        expected_ranks: 4

Multi Node Partitioned

Source: examples/multi-node-partitioned.yaml

# Eight-node allocation split between services with placement rules: explicit
# node ranges, sharing another service's nodes, and a percentage of nodes.
name: multi-node-partitioned

x-slurm:
  job_name: multi-node-partitioned
  time: "00:20:00"
  nodes: 8
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  service-a:
    image: alpine:3.20
    command:
      - /bin/sh
      - -lc
      - |
        echo "service-a nodes=$$HPC_COMPOSE_SERVICE_NODELIST"
        sleep 30
    readiness:
      type: sleep
      seconds: 1
    # First half of the allocation; the range is quoted to stay a string.
    x-slurm:
      placement:
        node_range: "0-3"

  service-b:
    image: alpine:3.20
    command:
      - /bin/sh
      - -lc
      - |
        echo "service-b nodes=$$HPC_COMPOSE_SERVICE_NODELIST"
        sleep 30
    readiness:
      type: sleep
      seconds: 1
    # Second half of the allocation.
    x-slurm:
      placement:
        node_range: "4-7"

  parameter-server:
    image: alpine:3.20
    depends_on:
      service-b:
        condition: service_healthy
    command:
      - /bin/sh
      - -lc
      - |
        echo "co-located with service-b on $$HPC_COMPOSE_SERVICE_NODELIST"
        sleep 30
    readiness:
      type: sleep
      seconds: 1
    # Placed on the nodes used by service-b instead of naming its own range.
    x-slurm:
      placement:
        share_with: service-b

  monitor:
    image: alpine:3.20
    command:
      - /bin/sh
      - -lc
      - |
        echo "monitor nodes=$$HPC_COMPOSE_SERVICE_NODELIST"
        sleep 30
    # 25 percent of the nodes, explicitly allowed to overlap other services.
    # NOTE(review): which nodes the percentage selects is not shown here.
    x-slurm:
      placement:
        node_percent: 25
        allow_overlap: true

Multi Node Torchrun

Source: examples/multi-node-torchrun.yaml

# Two-node, 4-GPU-per-node PyTorch launch with torchrun and the c10d
# rendezvous backend.
name: multi-node-torchrun

x-slurm:
  job_name: multi-node-torchrun
  time: "04:00:00"
  nodes: 2
  gpus_per_node: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  trainer:
    image: pytorch/pytorch:2.12.1-cuda13.2-cudnn9-runtime
    # Node count, processes per node, node rank and rendezvous endpoint all come
    # from HPC_COMPOSE_DIST_* environment variables. train.py is resolved
    # relative to the working directory and is not mounted by this spec.
    command:
      - /bin/sh
      - -lc
      - |
        echo "master=$$HPC_COMPOSE_DIST_MASTER_ADDR"
        echo "nodes=$$HPC_COMPOSE_SERVICE_NODELIST"
        echo "node_rank=$$HPC_COMPOSE_DIST_NODE_RANK"
        torchrun \
          --nnodes="$$HPC_COMPOSE_DIST_NNODES" \
          --nproc-per-node="$$HPC_COMPOSE_DIST_NPROC_PER_NODE" \
          --node-rank="$$HPC_COMPOSE_DIST_NODE_RANK" \
          --rdzv-backend=c10d \
          --rdzv-endpoint="$$HPC_COMPOSE_DIST_RDZV_ENDPOINT" \
          train.py
    readiness:
      type: sleep
      seconds: 5
    # One launcher task per node; torchrun starts the per-GPU processes itself.
    x-slurm:
      nodes: 2
      ntasks_per_node: 1
      gpus_per_node: 4

Multi Node Deepspeed

Source: examples/multi-node-deepspeed.yaml

# Two-node, 4-GPU-per-node DeepSpeed launch using the no-SSH launcher mode.
name: multi-node-deepspeed

x-slurm:
  job_name: multi-node-deepspeed
  time: "04:00:00"
  nodes: 2
  gpus_per_node: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  trainer:
    image: pytorch/pytorch:2.12.1-cuda13.2-cudnn9-runtime
    # With --no_ssh each node runs the launcher itself, so the node rank and
    # master address/port are passed explicitly from HPC_COMPOSE_DIST_*.
    # NOTE(review): the image is a PyTorch runtime image and nothing here
    # installs DeepSpeed — confirm the `deepspeed` CLI is available.
    command:
      - /bin/sh
      - -lc
      - |
        echo "master=$$HPC_COMPOSE_DIST_MASTER_ADDR"
        echo "nodes=$$HPC_COMPOSE_SERVICE_NODELIST"
        echo "node_rank=$$HPC_COMPOSE_DIST_NODE_RANK"
        deepspeed \
          --no_ssh \
          --hostfile "$$HPC_COMPOSE_DIST_HOSTFILE" \
          --num_nodes "$$HPC_COMPOSE_DIST_NNODES" \
          --num_gpus "$$HPC_COMPOSE_DIST_NPROC_PER_NODE" \
          --node_rank "$$HPC_COMPOSE_DIST_NODE_RANK" \
          --master_addr "$$HPC_COMPOSE_DIST_MASTER_ADDR" \
          --master_port "$$HPC_COMPOSE_DIST_MASTER_PORT" \
          train.py
    readiness:
      type: sleep
      seconds: 5
    # One launcher task per node.
    x-slurm:
      nodes: 2
      ntasks_per_node: 1
      gpus_per_node: 4

Multi Node Accelerate

Source: examples/multi-node-accelerate.yaml

# Two-node, 4-GPU-per-node launch with Hugging Face Accelerate.
name: multi-node-accelerate

x-slurm:
  job_name: multi-node-accelerate
  time: "04:00:00"
  nodes: 2
  gpus_per_node: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  trainer:
    image: pytorch/pytorch:2.12.1-cuda13.2-cudnn9-runtime
    # --num_processes takes the world size (HPC_COMPOSE_DIST_WORLD_SIZE), while
    # --num_machines takes the node count.
    # NOTE(review): the image is a PyTorch runtime image and nothing here
    # installs Accelerate — confirm the `accelerate` CLI is available.
    command:
      - /bin/sh
      - -lc
      - |
        echo "master=$$HPC_COMPOSE_DIST_MASTER_ADDR"
        echo "nodes=$$HPC_COMPOSE_SERVICE_NODELIST"
        echo "machine_rank=$$HPC_COMPOSE_DIST_NODE_RANK"
        accelerate launch \
          --multi_gpu \
          --num_machines "$$HPC_COMPOSE_DIST_NNODES" \
          --num_processes "$$HPC_COMPOSE_DIST_WORLD_SIZE" \
          --machine_rank "$$HPC_COMPOSE_DIST_NODE_RANK" \
          --main_process_ip "$$HPC_COMPOSE_DIST_MASTER_ADDR" \
          --main_process_port "$$HPC_COMPOSE_DIST_MASTER_PORT" \
          train.py
    readiness:
      type: sleep
      seconds: 5
    # One launcher task per node.
    x-slurm:
      nodes: 2
      ntasks_per_node: 1
      gpus_per_node: 4

Multi Node Horovod

Source: examples/multi-node-horovod.yaml

# Two-node Horovod job launched as 8 PMIx ranks (4 tasks per node, matching the
# 4 GPUs per node).
name: multi-node-horovod

x-slurm:
  job_name: multi-node-horovod
  time: "04:00:00"
  nodes: 2
  gpus_per_node: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  trainer:
    # NOTE(review): `latest` is a floating tag; consider pinning a version so
    # repeat runs use the same image.
    image: horovod/horovod:latest
    # Each rank prints its Slurm rank, local rank and world size, then runs the
    # training script.
    command:
      - /bin/sh
      - -lc
      - |
        echo "rank=$$SLURM_PROCID local_rank=$$SLURM_LOCALID world=$$SLURM_NTASKS"
        python train_horovod.py
    readiness:
      type: sleep
      seconds: 5
    x-slurm:
      nodes: 2
      ntasks_per_node: 4
      gpus_per_node: 4
      # Unlike the multi-node-mpi example, no implementation or launcher is set.
      mpi:
        type: pmix
        profile: openmpi
        # 2 nodes x 4 tasks per node.
        expected_ranks: 8

Multi Node Jax

Source: examples/multi-node-jax.yaml

# Two-node JAX job with one process per node.
name: multi-node-jax

x-slurm:
  job_name: multi-node-jax
  time: "04:00:00"
  nodes: 2
  gpus_per_node: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  trainer:
    # NOTE(review): `latest` is a floating tag; consider pinning a version so
    # repeat runs use the same image.
    image: jaxai/jax:latest
    # Prints the coordinator endpoint and this node's rank out of the node
    # count, then runs the training script.
    # NOTE(review): train_jax.py presumably passes these values to JAX's
    # distributed initialization itself — the script is not visible here.
    command:
      - /bin/sh
      - -lc
      - |
        echo "coordinator=$$HPC_COMPOSE_DIST_RDZV_ENDPOINT"
        echo "process_id=$$HPC_COMPOSE_DIST_NODE_RANK processes=$$HPC_COMPOSE_DIST_NNODES"
        python train_jax.py
    readiness:
      type: sleep
      seconds: 5
    x-slurm:
      nodes: 2
      ntasks_per_node: 1
      gpus_per_node: 4

NCCL Tests

Source: examples/nccl-tests.yaml

# Two-node NCCL all-reduce benchmark run as 8 PMIx ranks, one GPU per rank.
name: nccl-tests

x-slurm:
  job_name: nccl-tests
  time: "00:30:00"
  nodes: 2
  gpus_per_node: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  all-reduce:
    image: nvcr.io/nvidia/pytorch:24.08-py3
    # Looks for all_reduce_perf on PATH first, then in a /workspace build tree.
    # If neither exists it exits 127 with a hint on stderr.
    # The benchmark sweeps 8 bytes to 4G in factor-2 steps with 1 GPU per rank.
    command:
      - /bin/sh
      - -lc
      - |
        echo "rank=$$SLURM_PROCID local_rank=$$SLURM_LOCALID world=$$SLURM_NTASKS"
        if command -v all_reduce_perf >/dev/null 2>&1; then
          all_reduce_perf -b 8 -e 4G -f 2 -g 1
        elif [ -x /workspace/nccl-tests/build/all_reduce_perf ]; then
          /workspace/nccl-tests/build/all_reduce_perf -b 8 -e 4G -f 2 -g 1
        else
          echo "all_reduce_perf not found; use an image with nccl-tests installed" >&2
          exit 127
        fi
    readiness:
      type: sleep
      seconds: 2
    x-slurm:
      nodes: 2
      ntasks_per_node: 4
      gpus_per_node: 4
      mpi:
        type: pmix
        profile: openmpi
        # 2 nodes x 4 tasks per node.
        expected_ranks: 8

Ray Symmetric

Source: examples/ray-symmetric.yaml

# Two-node Ray cluster started with `ray symmetric-run`: every node runs the
# same command.
name: ray-symmetric

x-slurm:
  job_name: ray-symmetric
  time: "02:00:00"
  nodes: 2
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  ray:
    image: rayproject/ray:2.49.0-py310
    # The cluster address and minimum node count come from HPC_COMPOSE_DIST_*.
    # Everything after `--` is the program to run (python app.py).
    command:
      - /bin/sh
      - -lc
      - |
        ray symmetric-run \
          --address "$$HPC_COMPOSE_DIST_RDZV_ENDPOINT" \
          --min-nodes "$$HPC_COMPOSE_DIST_NNODES" \
          -- \
          python app.py
    readiness:
      type: sleep
      seconds: 10
    # One task per node.
    x-slurm:
      nodes: 2
      ntasks_per_node: 1

Rendezvous Client

Source: examples/rendezvous-client.yaml

# Client half of the rendezvous pair: a separate job (job_name: model-client)
# that fetches the URL of a service registered under the name `model-server`.
name: rendezvous-client

x-slurm:
  job_name: model-client
  time: "00:10:00"
  mem: 2G
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  # Matches the register.name used by the rendezvous-model-server example.
  rendezvous: model-server

services:
  client:
    image: curlimages/curl:8.10.1
    # -f fails on HTTP errors, -sS is quiet but still reports errors.
    # NOTE(review): HPC_COMPOSE_RDZV_MODEL_SERVER_URL is presumably derived from
    # the rendezvous name above — confirm the naming rule.
    command:
      - /bin/sh
      - -lc
      - |
        curl -fsS "$${HPC_COMPOSE_RDZV_MODEL_SERVER_URL}"

Rendezvous Model Server

Source: examples/rendezvous-model-server.yaml

# Server half of the rendezvous pair: serves HTTP on port 8000 and registers
# that endpoint under the name `model-server`.
name: rendezvous-model-server

x-slurm:
  job_name: model-server
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  partition: gpu
  gpus: 1
  mem: 32G
  time: '02:00:00'

services:
  model:
    image: python:3.12-slim
    # Endpoint registration: name, protocol, port and path, with a one-hour TTL.
    x-slurm:
      rendezvous:
        register:
          name: model-server
          protocol: http
          port: 8000
          path: /
          ttl_seconds: 3600
    # TCP readiness probe on port 8000 (no host given) with a 60-second timeout.
    readiness:
      type: tcp
      timeout_seconds: 60
      port: 8000
    # Stand-in for a real model server: Python's built-in static file server.
    command:
      - /bin/sh
      - -lc
      - |
        python -m http.server 8000

Ray Head Workers

Source: examples/ray-head-workers.yaml

# Two-node Ray cluster with an explicit head service and a worker service that
# joins it.
name: ray-head-workers

x-slurm:
  job_name: ray-head-workers
  time: "02:00:00"
  nodes: 2
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  head:
    image: rayproject/ray:2.49.0-py310
    # Starts the head on port 6379, advertising this service's primary node.
    # --block keeps the process in the foreground.
    command:
      - /bin/sh
      - -lc
      - |
        ray start --head --node-ip-address="$$HPC_COMPOSE_SERVICE_PRIMARY_NODE" --port=6379 --block
    # Readiness is a fixed 10-second sleep, not a probe of port 6379.
    readiness:
      type: sleep
      seconds: 10
    x-slurm:
      nodes: 1

  worker:
    image: rayproject/ray:2.49.0-py310
    # Joins the head at the job's primary node on port 6379.
    # NOTE(review): this assumes the head service runs on the job's primary
    # node — the head has no explicit placement, so verify.
    command:
      - /bin/sh
      - -lc
      - |
        ray start --address="$$HPC_COMPOSE_PRIMARY_NODE:6379" --block
    depends_on:
      head:
        condition: service_healthy
    # Pinned to node index 1; the value is quoted so it stays a string.
    x-slurm:
      nodes: 1
      placement:
        node_range: "1"

Dask Scheduler Workers

Source: examples/dask-scheduler-workers.yaml

# Two-node Dask cluster: one scheduler service and one worker task per node.
name: dask-scheduler-workers

x-slurm:
  job_name: dask-scheduler-workers
  time: "02:00:00"
  nodes: 2
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  scheduler:
    # NOTE(review): `latest` is a floating tag; consider pinning a version so
    # the scheduler and workers stay on matching releases across runs.
    image: ghcr.io/dask/dask:latest
    command:
      - /bin/sh
      - -lc
      - |
        dask scheduler --host "$$HPC_COMPOSE_SERVICE_PRIMARY_NODE" --port 8786
    # NOTE(review): the scheduler is started with --host set to the node name,
    # but readiness probes 127.0.0.1 — confirm the port is reachable on
    # loopback, or probe the node address instead.
    readiness:
      type: tcp
      host: 127.0.0.1
      port: 8786
      timeout_seconds: 60
    x-slurm:
      nodes: 1

  workers:
    image: ghcr.io/dask/dask:latest
    # Each worker connects to the scheduler at the job's primary node.
    command:
      - /bin/sh
      - -lc
      - |
        dask worker "tcp://$$HPC_COMPOSE_PRIMARY_NODE:8786"
    depends_on:
      scheduler:
        condition: service_healthy
    # One worker task on each of the two nodes.
    x-slurm:
      nodes: 2
      ntasks_per_node: 1

Spark Standalone

Source: examples/spark-standalone.yaml

# Spark standalone cluster on two nodes: a master, one worker per node, and an
# application submitted once the master is healthy.
name: spark-standalone

x-slurm:
  job_name: spark-standalone
  time: "02:00:00"
  nodes: 2
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  master:
    image: apache/spark:3.5.3
    # The start script is followed by `tail -f` on the Spark logs, which keeps
    # the service in the foreground and streams the logs.
    command:
      - /bin/sh
      - -lc
      - |
        /opt/spark/sbin/start-master.sh --host "$$HPC_COMPOSE_SERVICE_PRIMARY_NODE" --port 7077
        tail -f /opt/spark/logs/*
    # NOTE(review): the master is started with --host set to the node name, but
    # readiness probes 127.0.0.1 — confirm the port is reachable on loopback.
    readiness:
      type: tcp
      host: 127.0.0.1
      port: 7077
      timeout_seconds: 60
    x-slurm:
      nodes: 1

  workers:
    image: apache/spark:3.5.3
    # Same start-then-tail pattern as the master.
    command:
      - /bin/sh
      - -lc
      - |
        /opt/spark/sbin/start-worker.sh "spark://$$HPC_COMPOSE_PRIMARY_NODE:7077"
        tail -f /opt/spark/logs/*
    depends_on:
      master:
        condition: service_healthy
    # One worker task on each of the two nodes.
    x-slurm:
      nodes: 2
      ntasks_per_node: 1

  app:
    image: apache/spark:3.5.3
    # Depends only on the master being healthy, not on the workers.
    command:
      - /bin/sh
      - -lc
      - |
        spark-submit --master "spark://$$HPC_COMPOSE_PRIMARY_NODE:7077" app.py
    depends_on:
      master:
        condition: service_healthy
    x-slurm:
      nodes: 1

Flux Nested

Source: examples/flux-nested.yaml

# Starts a nested Flux instance inside a two-node Slurm allocation and runs
# `hostname` across the nodes through it.
name: flux-nested

# Host backend: the service has no image.
runtime:
  backend: host

x-slurm:
  job_name: flux-nested
  time: "01:00:00"
  nodes: 2
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  flux:
    # The inner command is single-quoted, so the outer shell does not expand
    # the node-count variable; the bash started by `flux start` does.
    command:
      - /bin/sh
      - -lc
      - |
        flux start bash -lc 'flux run --label-io -N "$$HPC_COMPOSE_DIST_NNODES" hostname'
    # One task per node.
    x-slurm:
      nodes: 2
      ntasks_per_node: 1

Nextflow Bridge

Source: examples/nextflow-bridge.yaml

# Runs a Nextflow pipeline inside a single-node allocation on the host backend
# and exports its work directory, reports and logs as artifacts.
name: nextflow-bridge

# Host backend: the service has no image.
runtime:
  backend: host

x-slurm:
  job_name: nextflow-bridge
  time: "02:00:00"
  nodes: 1
  cpus_per_task: 8
  mem: 16G
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  # Artifacts are exported into a per-job directory named after SLURM_JOB_ID.
  artifacts:
    export_dir: ./results/${SLURM_JOB_ID}
    paths:
      - /hpc-compose/job/nextflow-work/**
      - /hpc-compose/job/reports/**
      - /hpc-compose/job/logs/**

services:
  nextflow:
    # NEXTFLOW_ARGS is expanded unquoted on the last line of the script, so it
    # is word-split into separate arguments.
    command:
      - /bin/sh
      - -lc
      - |
        # Write under HPC_COMPOSE_JOB_DIR (the portable per-job scratch path) so
        # this spec runs on both the container and host backends; artifacts below
        # are declared with the equivalent /hpc-compose/job/** convention.
        mkdir -p "$$HPC_COMPOSE_JOB_DIR/nextflow-work" "$$HPC_COMPOSE_JOB_DIR/reports"
        nextflow run "$${NEXTFLOW_PIPELINE:-main.nf}" \
          -work-dir "$$HPC_COMPOSE_JOB_DIR/nextflow-work" \
          -with-report "$$HPC_COMPOSE_JOB_DIR/reports/report.html" \
          -with-trace "$$HPC_COMPOSE_JOB_DIR/reports/trace.txt" \
          $${NEXTFLOW_ARGS:-}
    # These defaults mirror the shell fallbacks used in the script.
    environment:
      NEXTFLOW_PIPELINE: main.nf
      NEXTFLOW_ARGS: ""
    x-slurm:
      ntasks: 1

Snakemake Bridge

Source: examples/snakemake-bridge.yaml

# Runs a Snakemake workflow inside a single-node allocation on the host backend
# and exports its work directory, reports and logs as artifacts.
name: snakemake-bridge

# Host backend: the service has no image.
runtime:
  backend: host

x-slurm:
  job_name: snakemake-bridge
  time: "02:00:00"
  nodes: 1
  cpus_per_task: 8
  mem: 16G
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  # Artifacts are exported into a per-job directory named after SLURM_JOB_ID.
  artifacts:
    export_dir: ./results/${SLURM_JOB_ID}
    paths:
      - /hpc-compose/job/snakemake-work/**
      - /hpc-compose/job/reports/**
      - /hpc-compose/job/logs/**

services:
  snakemake:
    # --cores falls back from SNAKEMAKE_CORES to SLURM_CPUS_PER_TASK to 1.
    # --directory falls back from SNAKEMAKE_WORKDIR to the per-job work dir.
    # SNAKEMAKE_ARGS is expanded unquoted, so it is word-split into arguments.
    command:
      - /bin/sh
      - -lc
      - |
        # Write under HPC_COMPOSE_JOB_DIR (the portable per-job scratch path) so
        # this spec runs on both the container and host backends; the artifacts
        # below are declared with the equivalent /hpc-compose/job/** convention.
        mkdir -p "$$HPC_COMPOSE_JOB_DIR/snakemake-work" "$$HPC_COMPOSE_JOB_DIR/reports"
        snakemake \
          --snakefile "$${SNAKEMAKE_FILE:-Snakefile}" \
          --cores "$${SNAKEMAKE_CORES:-$${SLURM_CPUS_PER_TASK:-1}}" \
          --directory "$${SNAKEMAKE_WORKDIR:-$$HPC_COMPOSE_JOB_DIR/snakemake-work}" \
          --printshellcmds \
          $${SNAKEMAKE_ARGS:-}
    # SNAKEMAKE_CORES and SNAKEMAKE_WORKDIR are not set here, so the script's
    # fallbacks apply unless they are provided elsewhere.
    environment:
      SNAKEMAKE_FILE: Snakefile
      SNAKEMAKE_ARGS: ""
    x-slurm:
      ntasks: 1

Multi Stage Pipeline

Source: examples/multi-stage-pipeline.yaml

# Two-stage pipeline: `producer` writes a CSV into the shared job directory and
# `consumer` summarises it once the producer's log readiness pattern appears.
name: multi-stage-pipeline

x-slurm:
  job_name: multi-stage-pipeline
  time: "00:30:00"
  mem: 8G
  cpus_per_task: 4
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  # Writes 1000 random rows (id, value, category) to /hpc-compose/job/output.csv.
  producer:
    image: python:3.11-slim
    command:
      - /bin/sh
      - -lc
      - |
        python -c "
        import csv, random

        output = '/hpc-compose/job/output.csv'
        with open(output, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['id', 'value', 'category'])
            for i in range(1000):
                writer.writerow([i, round(random.gauss(50, 15), 2), random.choice(['A', 'B', 'C'])])

        print(f'Wrote 1000 rows to {output}')
        print('producer complete')
        "
    # The pattern is the last line the script prints, i.e. after the CSV file
    # has been closed.
    readiness:
      type: log
      pattern: "producer complete"
      timeout_seconds: 60
    x-slurm:
      cpus_per_task: 1

  # Reads the producer's CSV and prints the row count plus per-category count
  # and mean.
  consumer:
    image: python:3.11-slim
    depends_on:
      producer:
        condition: service_healthy
    command:
      - /bin/sh
      - -lc
      - |
        python -c "
        import csv, collections

        with open('/hpc-compose/job/output.csv') as f:
            reader = csv.DictReader(f)
            rows = list(reader)

        by_cat = collections.defaultdict(list)
        for row in rows:
            by_cat[row['category']].append(float(row['value']))

        print(f'Read {len(rows)} rows')
        for cat in sorted(by_cat):
            vals = by_cat[cat]
            print(f'  {cat}: count={len(vals)}, mean={sum(vals)/len(vals):.2f}')

        print('consumer complete')
        "
    x-slurm:
      cpus_per_task: 1

Pipeline DAG

Source: examples/pipeline-dag.yaml

# Linear three-stage DAG: preprocess -> train -> postprocess. Each stage starts
# only after the previous one completed successfully, and the stages hand files
# to each other through /hpc-compose/job/pipeline.
name: pipeline-dag

x-slurm:
  job_name: pipeline-dag
  time: "00:20:00"
  mem: "4G"
  cache_dir: "${CACHE_DIR:-/cluster/shared/hpc-compose-cache}"

services:
  # Stage 1: create the shared directory and write prepared.txt.
  preprocess:
    image: "alpine:3.20"
    command:
      - /bin/sh
      - -lc
      - |
        mkdir -p /hpc-compose/job/pipeline
        printf 'records=3\n' > /hpc-compose/job/pipeline/prepared.txt

  # Stage 2: print prepared.txt and write model.txt.
  train:
    image: "alpine:3.20"
    depends_on:
      preprocess:
        condition: service_completed_successfully
    command:
      - /bin/sh
      - -lc
      - |
        cat /hpc-compose/job/pipeline/prepared.txt
        printf 'accuracy=0.91\n' > /hpc-compose/job/pipeline/model.txt

  # Stage 3: print model.txt and write report.txt.
  postprocess:
    image: "alpine:3.20"
    depends_on:
      train:
        condition: service_completed_successfully
    command:
      - /bin/sh
      - -lc
      - |
        cat /hpc-compose/job/pipeline/model.txt
        printf 'done\n' > /hpc-compose/job/pipeline/report.txt

Postgres ETL

Source: examples/postgres-etl.yaml

# PostgreSQL plus a Python ETL step in one job: `etl` waits for the database's
# TCP readiness probe, inserts 100 rows, and prints the row count and average.
name: postgres-etl

x-slurm:
  job_name: postgres-etl
  time: "01:00:00"
  mem: "16G"
  cpus_per_task: 4
  cache_dir: "${CACHE_DIR:-/cluster/shared/hpc-compose-cache}"

services:
  # Database service; the credentials here are demo values that must match the
  # DATABASE_URL used by the `etl` service.
  postgres:
    image: "postgres:16"
    environment:
      POSTGRES_USER: "etl"
      POSTGRES_PASSWORD: "etl"
      POSTGRES_DB: "pipeline"
    readiness:
      type: tcp
      host: 127.0.0.1
      port: 5432
      timeout_seconds: 30
    x-slurm:
      cpus_per_task: 2

  # ETL client; psycopg2 is installed by the x-runtime prepare step below.
  etl:
    image: "python:3.11-slim"
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      DATABASE_URL: "postgresql://etl:etl@127.0.0.1:5432/pipeline"
    command:
      - /bin/sh
      - -lc
      - |
        python -c "
        import psycopg2, os

        conn = psycopg2.connect(os.environ['DATABASE_URL'])
        cur = conn.cursor()
        cur.execute('CREATE TABLE IF NOT EXISTS results (id SERIAL, value FLOAT)')
        for i in range(100):
            cur.execute('INSERT INTO results (value) VALUES (%s)', (i * 1.5,))
        conn.commit()
        cur.execute('SELECT count(*), avg(value) FROM results')
        count, avg = cur.fetchone()
        print(f'Inserted {count} rows, average value: {avg:.2f}')
        conn.close()
        "
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir psycopg2-binary
    x-slurm:
      cpus_per_task: 2

Restart Policy

Source: examples/restart-policy.yaml

# Restart-policy demo: a worker that fails its first two attempts with exit code
# 42 and succeeds on the third, so the restart_on_failure policy is exercised.
name: restart-policy

x-slurm:
  job_name: restart-policy
  time: "00:10:00"
  mem: 4G
  cpus_per_task: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  flaky-worker:
    image: python:3.11-slim
    # The attempt counter is kept in
    # /hpc-compose/job/restart-policy/attempts.txt so that it survives between
    # restarts of this service.
    command:
      - /bin/sh
      - -lc
      - |
        python - <<'PY'
        import pathlib
        import sys
        import time

        state_dir = pathlib.Path("/hpc-compose/job/restart-policy")
        counter_path = state_dir / "attempts.txt"

        state_dir.mkdir(parents=True, exist_ok=True)
        attempts = int(counter_path.read_text()) if counter_path.exists() else 0
        attempts += 1
        counter_path.write_text(f"{attempts}\n")

        print(f"attempt {attempts}")
        if attempts <= 2:
            print("simulating transient failure")
            sys.exit(42)

        print("work completed after transient failures")
        time.sleep(1)
        PY
    x-slurm:
      # The script needs two restarts to succeed, which fits within both limits
      # below. NOTE(review): the exact interplay of max_restarts and the
      # window_seconds / max_restarts_in_window pair is defined by hpc-compose;
      # confirm against its failure_policy reference.
      failure_policy:
        mode: restart_on_failure
        max_restarts: 5
        backoff_seconds: 2
        window_seconds: 60
        max_restarts_in_window: 3

Training Checkpoints

Source: examples/training-checkpoints.yaml

# Single-GPU training demo that saves one checkpoint per epoch to a directory
# mounted from shared storage.
name: training-checkpoints

x-slurm:
  job_name: training-checkpoints
  time: "04:00:00"
  mem: "64G"
  cpus_per_task: 8
  gpus: 1
  cache_dir: "${CACHE_DIR:-/cluster/shared/hpc-compose-cache}"

services:
  trainer:
    image: "pytorch/pytorch:2.12.1-cuda13.2-cudnn9-runtime"
    # Checkpoints are written to /checkpoints inside the service, which is this
    # mounted host directory.
    volumes:
      - "/shared/$USER/checkpoints:/checkpoints"
    environment:
      CHECKPOINT_DIR: "/checkpoints"
      NUM_EPOCHS: "10"
    # Trains a small linear model on random data for NUM_EPOCHS epochs, falling
    # back to CPU when CUDA is unavailable, and saves a .pt file every epoch.
    command:
      - /bin/sh
      - -lc
      - |
        python -c "
        import os, torch

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f'Training on {device}')

        ckpt_dir = os.environ['CHECKPOINT_DIR']
        os.makedirs(ckpt_dir, exist_ok=True)

        model = torch.nn.Linear(128, 10).to(device)
        optimizer = torch.optim.Adam(model.parameters())
        data = torch.randn(256, 128, device=device)

        for epoch in range(int(os.environ['NUM_EPOCHS'])):
            out = model(data)
            loss = out.sum()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            path = os.path.join(ckpt_dir, f'checkpoint_epoch_{epoch}.pt')
            torch.save({'epoch': epoch, 'model': model.state_dict()}, path)
            print(f'Epoch {epoch}: loss={loss.item():.4f}, saved {path}')

        print('Training complete')
        "
    x-slurm:
      gpus: 1
      cpus_per_task: 4

Training Resume

Source: examples/training-resume.yaml

# Resumable training demo: progress is tracked in latest.json under the resume
# directory, and every epoch writes a JSON checkpoint that is exported as a job
# artifact.
name: training-resume

x-slurm:
  job_name: training-resume
  time: "04:00:00"
  mem: 64G
  cpus_per_task: 8
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  resume:
    path: /shared/$USER/runs/training-resume
  artifacts:
    export_dir: ./results/${SLURM_JOB_ID}
    paths:
      - /hpc-compose/job/checkpoints/**

services:
  trainer:
    image: pytorch/pytorch:2.12.1-cuda13.2-cudnn9-runtime
    environment:
      NUM_EPOCHS: "10"
    # The script reads HPC_COMPOSE_RESUME_DIR, HPC_COMPOSE_ATTEMPT and
    # HPC_COMPOSE_IS_RESUME from the environment and continues from the epoch
    # recorded in latest.json when that file exists.
    command:
      - /bin/sh
      - -lc
      - |
        python - <<'PY'
        import json
        import os
        import pathlib
        import time

        resume_dir = pathlib.Path(os.environ["HPC_COMPOSE_RESUME_DIR"])
        attempt = os.environ["HPC_COMPOSE_ATTEMPT"]
        is_resume = os.environ["HPC_COMPOSE_IS_RESUME"] == "1"
        checkpoint_dir = pathlib.Path("/hpc-compose/job/checkpoints")
        latest_state_path = resume_dir / "latest.json"

        resume_dir.mkdir(parents=True, exist_ok=True)
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

        start_epoch = 0
        if latest_state_path.exists():
            state = json.loads(latest_state_path.read_text())
            start_epoch = state["next_epoch"]
            print(f"Resuming run at epoch {start_epoch} (attempt {attempt})")
        else:
            print(f"Starting fresh run (attempt {attempt})")

        for epoch in range(start_epoch, int(os.environ["NUM_EPOCHS"])):
            state = {
                "completed_epoch": epoch,
                "next_epoch": epoch + 1,
                "attempt": int(attempt),
                "is_resume": is_resume,
            }
            payload = json.dumps(state, indent=2) + "\n"
            # Write the checkpoint artifact first and advance the resume pointer
            # last: an interruption between the two writes then repeats this
            # epoch on resume instead of skipping its checkpoint.
            artifact_path = checkpoint_dir / f"checkpoint_epoch_{epoch}.json"
            artifact_path.write_text(payload)
            latest_state_path.write_text(payload)
            print(f"Epoch {epoch}: wrote {artifact_path}")
            time.sleep(1)
        PY

Training Sweep

Source: examples/training-sweep.yaml

# Hyperparameter sweep demo: one trainer service parameterised by `lr` and
# `batch_size`, with defaults so the spec also runs outside a sweep.
name: training-sweep

x-slurm:
  job_name: training-sweep
  time: "00:20:00"
  mem: "8G"
  cpus_per_task: 2
  cache_dir: "${CACHE_DIR:-/cluster/shared/hpc-compose-cache}"

# Three learning rates and two batch sizes.
# NOTE(review): `matrix: full` presumably expands to every combination
# (6 trials) - confirm against the sweep reference.
sweep:
  parameters:
    lr:
      - 0.001
      - 0.01
      - 0.1
    batch_size:
      - 32
      - 64
  matrix: full

services:
  trainer:
    image: "python:3.11-slim"
    # Each value falls back to a default when the sweep variable is not set.
    environment:
      LR: "${lr:-0.001}"
      BATCH_SIZE: "${batch_size:-32}"
      SWEEP_ID: "${HPC_COMPOSE_SWEEP_ID:-manual}"
      TRIAL_ID: "${HPC_COMPOSE_SWEEP_TRIAL:-manual}"
    # Prints a pseudo-random score in [0.8, 0.85) that is seeded from the
    # parameters, so each (lr, batch_size) pair always yields the same score.
    command:
      - python
      - -c
      - |
        import os
        import random

        lr = float(os.environ["LR"])
        batch_size = int(os.environ["BATCH_SIZE"])
        random.seed(f"{lr}:{batch_size}")
        score = 0.8 + random.random() * 0.05

        print(f"sweep={os.environ['SWEEP_ID']} trial={os.environ['TRIAL_ID']}")
        print(f"lr={lr} batch_size={batch_size} score={score:.4f}")

Training Tensorboard

Source: examples/training-tensorboard.yaml

# GPU training with a live TensorBoard sidecar.
#
# The trainer writes TensorBoard event files to the in-job shared directory
# /hpc-compose/job/logs; the tensorboard sidecar serves them on port 6006 and
# is gated by an HTTP readiness probe. Reach it from your laptop with an SSH
# tunnel, e.g. `ssh -L 6006:<compute-node>:6006 <login-host>`, then open
# http://127.0.0.1:6006. The event files are exported as tracked artifacts.
name: training-tensorboard

x-slurm:
  job_name: training-tensorboard
  time: "01:00:00"
  mem: 32G
  cpus_per_task: 8
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  artifacts:
    export_dir: ./results/${SLURM_JOB_ID}
    paths:
      - /hpc-compose/job/logs/**

services:
  # Logs a synthetic "loss" scalar for 100 steps (one per second), then creates
  # /hpc-compose/job/request.done, which the sidecar polls for as its stop
  # signal. Because of `set -eu`, the marker is only created when the Python
  # step succeeds.
  trainer:
    image: pytorch/pytorch:2.12.1-cuda13.2-cudnn9-runtime
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        mkdir -p /hpc-compose/job/logs
        python - <<'PY'
        import time
        from torch.utils.tensorboard import SummaryWriter

        writer = SummaryWriter("/hpc-compose/job/logs")
        for step in range(100):
            writer.add_scalar("loss", 1.0 / (step + 1), step)
            writer.flush()
            time.sleep(1)
        writer.close()
        PY
        touch /hpc-compose/job/request.done
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir tensorboard
    x-slurm:
      gpus: 1
      cpus_per_task: 4

  # Starts TensorBoard in the background and checks every 5 seconds for
  # request.done. If TensorBoard exits before the marker appears, the script
  # exits with TensorBoard's status; once the marker exists, TensorBoard is
  # killed and the script ends.
  tensorboard:
    image: python:3.11-slim
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        mkdir -p /hpc-compose/job/logs
        tensorboard --logdir /hpc-compose/job/logs --host 0.0.0.0 --port 6006 &
        tb_pid=$$!
        while [ ! -f /hpc-compose/job/request.done ]; do
          if ! kill -0 "$$tb_pid" 2>/dev/null; then
            wait "$$tb_pid"
            exit $$?
          fi
          sleep 5
        done
        kill "$$tb_pid" 2>/dev/null || true
        wait "$$tb_pid" || true
    readiness:
      type: http
      url: http://127.0.0.1:6006
      timeout_seconds: 300
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir tensorboard
    x-slurm:
      cpus_per_task: 2

vLLM OpenAI

Source: examples/vllm-openai.yaml

# vLLM OpenAI-compatible server plus a one-shot client: the client sends one
# chat request after the server is ready and then creates request.done, which
# tells the server script to shut down.
name: vllm-openai

x-slurm:
  job_name: vllm-openai
  time: "01:00:00"
  mem: 64G
  cpus_per_task: 8
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  # Runs the API server in the background and polls every second for
  # request.done. If the server dies first, the script exits with the server's
  # status; otherwise the server is killed once the marker appears.
  vllm:
    image: vllm/vllm-openai:latest
    environment:
      MODEL_NAME: facebook/opt-125m
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        rm -f /hpc-compose/job/request.done
        python -m vllm.entrypoints.openai.api_server \
          --model "$$MODEL_NAME" \
          --host 0.0.0.0 \
          --port 8000 &
        server_pid=$$!
        while [ ! -f /hpc-compose/job/request.done ]; do
          if ! kill -0 "$$server_pid" 2>/dev/null; then
            wait "$$server_pid"
            exit $$?
          fi
          sleep 1
        done
        kill "$$server_pid" 2>/dev/null || true
        wait "$$server_pid" || true
    readiness:
      type: log
      pattern: "Uvicorn running on"
      timeout_seconds: 300
    x-slurm:
      gpus: 1
      cpus_per_task: 4

  # NOTE(review): this client uses the chat-completions endpoint; confirm that
  # the served model (facebook/opt-125m) provides a chat template, which vLLM
  # needs in order to serve chat requests.
  client:
    image: python:3.11-slim
    depends_on:
      vllm:
        condition: service_healthy
    environment:
      OPENAI_BASE_URL: http://127.0.0.1:8000/v1
      MODEL_NAME: facebook/opt-125m
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        python -c "
        import openai, os

        client = openai.OpenAI(
            base_url=os.environ['OPENAI_BASE_URL'],
            api_key='unused',
        )
        response = client.chat.completions.create(
            model=os.environ['MODEL_NAME'],
            messages=[
                {'role': 'system', 'content': 'You are a concise assistant.'},
                {'role': 'user', 'content': 'What is HPC in one sentence?'},
            ],
            max_tokens=64,
            temperature=0.2,
        )
        print(response.choices[0].message.content)
        "
        touch /hpc-compose/job/request.done
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir openai
    x-slurm:
      cpus_per_task: 2

vLLM UV Worker

Source: examples/vllm-uv-worker.yaml

# vLLM OpenAI-compatible server plus a uv-managed Python worker mounted from
# ./vllm-uv-worker. The server script stops once /hpc-compose/job/request.done
# exists; the worker receives that path as REQUEST_DONE_PATH.
name: vllm-uv-worker

x-slurm:
  job_name: vllm-uv-worker
  time: "01:00:00"
  mem: "64G"
  cpus_per_task: 8
  gpus: 1
  cache_dir: "${CACHE_DIR:-/cluster/shared/hpc-compose-cache}"

services:
  # Runs the API server in the background and polls every second for
  # request.done. If the server dies first, the script exits with the server's
  # status; otherwise the server is killed once the marker appears.
  vllm:
    image: "vllm/vllm-openai:latest"
    environment:
      MODEL_NAME: "facebook/opt-125m"
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        rm -f /hpc-compose/job/request.done
        python -m vllm.entrypoints.openai.api_server \
          --model "$$MODEL_NAME" \
          --host 0.0.0.0 \
          --port 8000 &
        server_pid=$$!
        while [ ! -f /hpc-compose/job/request.done ]; do
          if ! kill -0 "$$server_pid" 2>/dev/null; then
            wait "$$server_pid"
            exit $$?
          fi
          sleep 1
        done
        kill "$$server_pid" 2>/dev/null || true
        wait "$$server_pid" || true
    readiness:
      type: log
      pattern: "Uvicorn running on"
      timeout_seconds: 300
    x-slurm:
      gpus: 1
      cpus_per_task: 4

  # Runs worker.py from the mounted project with `uv run`, keeping uv's cache in
  # the job directory; uv itself is installed by the prepare step.
  worker:
    image: "python:3.11-slim"
    working_dir: "/workspace"
    volumes:
      - "./vllm-uv-worker:/workspace"
    depends_on:
      vllm:
        condition: service_healthy
    environment:
      OPENAI_BASE_URL: "http://127.0.0.1:8000/v1"
      MODEL_NAME: "facebook/opt-125m"
      REQUEST_DONE_PATH: "/hpc-compose/job/request.done"
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        UV_CACHE_DIR=/hpc-compose/job/.uv-cache uv run worker.py
    x-runtime:
      prepare:
        commands:
          - pip install --no-cache-dir uv
    x-slurm:
      cpus_per_task: 2

Eval Harness

Source: examples/eval-harness.yaml

# Evaluation harness: serves a model with vLLM and benchmarks it with lm_eval
# through the OpenAI-compatible completions endpoint; results are exported as
# job artifacts.
name: eval-harness

x-slurm:
  job_name: eval-harness
  time: "01:00:00"
  mem: 64G
  cpus_per_task: 8
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}
  artifacts:
    export_dir: ./results/${SLURM_JOB_ID}
    paths:
      - /hpc-compose/job/results/**

# Sweep stub: benchmark a served model across models/tasks. The base spec stays
# runnable without sweep variables because the service env carries
# ${...:-default} fallbacks; `hpc-compose sweep submit` overrides model/tasks.
sweep:
  parameters:
    model: [facebook/opt-125m]
    tasks: [hellaswag]
  matrix: full

services:
  # Runs the API server in the background and polls every 5 seconds for
  # request.done. If the server dies first, the script exits with the server's
  # status; otherwise the server is killed once the marker appears.
  vllm:
    image: vllm/vllm-openai:latest
    environment:
      MODEL_NAME: "${model:-facebook/opt-125m}"
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        rm -f /hpc-compose/job/request.done
        python -m vllm.entrypoints.openai.api_server \
          --model "$$MODEL_NAME" \
          --host 0.0.0.0 \
          --port 8000 &
        server_pid=$$!
        while [ ! -f /hpc-compose/job/request.done ]; do
          if ! kill -0 "$$server_pid" 2>/dev/null; then
            wait "$$server_pid"
            exit $$?
          fi
          sleep 5
        done
        kill "$$server_pid" 2>/dev/null || true
        wait "$$server_pid" || true
    readiness:
      type: http
      url: http://127.0.0.1:8000/health
      timeout_seconds: 600
    x-slurm:
      gpus: 1
      cpus_per_task: 4

  # Runs lm_eval against the served model, writes results under
  # /hpc-compose/job/results, then creates request.done to stop the server.
  client:
    image: python:3.11-slim
    depends_on:
      vllm:
        condition: service_healthy
    environment:
      OPENAI_BASE_URL: http://127.0.0.1:8000/v1
      MODEL_NAME: "${model:-facebook/opt-125m}"
      TASKS: "${tasks:-hellaswag}"
    command:
      - /bin/sh
      - -lc
      - |
        set -eu
        mkdir -p /hpc-compose/job/results
        lm_eval \
          --model local-completions \
          --model_args "base_url=$$OPENAI_BASE_URL/completions,model=$$MODEL_NAME,num_concurrent=4" \
          --tasks "$$TASKS" \
          --output_path /hpc-compose/job/results
        touch /hpc-compose/job/request.done
    x-runtime:
      prepare:
        commands:
          # `local-completions` is an API-backed lm_eval model; install the
          # `api` extra so its HTTP client dependencies are present. The extra
          # is shell-quoted so the brackets are never treated as a glob.
          - 'pip install --no-cache-dir "lm-eval[api]"'
    x-slurm:
      cpus_per_task: 2

CUDA Probe

Source: examples/cuda-probe.yaml

name: cuda-probe

# Fast compute-node CUDA / GPU probe.
#
# No repo install, no uv, no model files: a tiny NVIDIA CUDA base image is
# imported and run as a one-shot Slurm job. (nvidia-smi is provided by the host
# driver and injected into the container at runtime by the enroot/pyxis NVIDIA
# hook — the CUDA base image itself does not bundle it.) It isolates
# "can the cluster give me a GPU?" from any framework/Python environment, so if
# this passes but a later PyTorch/JAX/TensorFlow job fails, the problem is in the
# framework image, not in Slurm/Pyxis/GPU allocation/driver visibility.
#
# Notes:
# - The base image's CUDA version is independent of the driver's reported CUDA;
#   nvidia-smi reports the *driver's* max CUDA, not the image toolkit version.
# - GPUs are only injected at Slurm/Pyxis runtime. Prepare-time image import runs
#   CPU-only on the login node (hpc-compose disables the enroot NVIDIA hook during
#   prepare), so importing this CUDA image does not need a driver.

runtime:
  backend: pyxis

x-slurm:
  job_name: cuda-probe
  time: "00:10:00"
  cpus_per_task: 2
  mem: 8G
  # Request one GPU. Some sites (e.g. HAICORE) prefer an explicit gres such as
  # `gres: gpu:1`; set partition/account via settings, --profile, or flags.
  gpus: 1
  cache_dir: ${CACHE_DIR:-/cluster/shared/hpc-compose-cache}

services:
  probe:
    image: nvidia/cuda:12.4.1-base-ubuntu22.04
    # Prints the hostname and the GPU-related CUDA/Slurm environment variables,
    # then runs nvidia-smi (full report and device list). Under `set -eu` a
    # failing nvidia-smi fails the probe, while the final /dev/nvidia* listing is
    # best-effort because of `|| true`.
    # NOTE(review): this script uses single-`$` shell expansions, whereas the
    # `command:` scripts in the other examples escape them as `$$` - confirm that
    # `script:` bodies are not subject to spec-time variable interpolation.
    script: |
      set -eu
      echo "hostname=$(hostname)"
      echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"
      echo "SLURM_JOB_ID=${SLURM_JOB_ID:-}"
      echo "SLURM_JOB_NODELIST=${SLURM_JOB_NODELIST:-}"
      nvidia-smi
      nvidia-smi -L
      ls -l /dev/nvidia* 2>/dev/null || true

Keyboard shortcuts

hpc-compose