nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4
vLLM serving Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 on a single GB10
Quick Info
MODEL
nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4
RUNTIME
vllm
TENSOR PARALLEL
1
NODES
1
AUTHOR
S
Seth Hobson
Recipe YAML
# Recipe: serve the NVFP4-quantized Nemotron-3 Nano Omni model with vLLM
# on a single GB10 node (1 node, tensor-parallel size 1).
description: vLLM serving Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 on a single GB10
model: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4
# aarch64 / CUDA 13.0 image — matches the Grace-Blackwell (GB10) platform
container: vllm/vllm-openai:v0.20.0-aarch64-cu130-ubuntu2404
mods:
  - mods/nemotron-omni
# Values substituted into the {placeholder} fields of `command` below.
defaults:
  port: 8000
  # Quoted so the bind address stays an unambiguous plain string.
  host: '0.0.0.0'
  tensor_parallel: 1
  gpu_memory_utilization: 0.8
  max_model_len: 131072
  max_num_batched_tokens: 32768
  max_num_seqs: 8
# Environment for the serving container. Numeric-looking values are quoted
# so they are passed to the process as strings, not YAML integers.
env:
  # Use the Marlin GEMM backend for NVFP4 weights; FlashInfer MoE FP4 is
  # explicitly disabled below.
  VLLM_NVFP4_GEMM_BACKEND: marlin
  VLLM_MARLIN_USE_ATOMIC_ADD: '1'
  VLLM_USE_FLASHINFER_MOE_FP4: '0'
  VLLM_ALLOW_LONG_MAX_MODEL_LEN: '1'
  CUDA_MANAGED_FORCE_DEVICE_ALLOC: '1'
  # Quoted: the value itself contains a ':' and must stay one literal string.
  PYTORCH_CUDA_ALLOC_CONF: 'expandable_segments:True'
  OMP_NUM_THREADS: '4'
# Launch command. '{{' / '}}' are escaped braces so the literal JSON survives
# the {placeholder} substitution step.
command: |
  vllm serve nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 \
  --served-model-name nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-NVFP4 nemotron-3-nano-omni \
  --tensor-parallel-size {tensor_parallel} \
  --port {port} --host {host} \
  --max-model-len {max_model_len} \
  --max-num-seqs {max_num_seqs} \
  --max-num-batched-tokens {max_num_batched_tokens} \
  --gpu-memory-utilization {gpu_memory_utilization} \
  --quantization fp4 \
  --moe-backend marlin \
  --kv-cache-dtype fp8 \
  --mamba-ssm-cache-dtype float32 \
  --enable-prefix-caching \
  --reasoning-parser nemotron_v3 \
  --enable-auto-tool-choice \
  --tool-call-parser qwen3_coder \
  --video-pruning-rate 0.5 \
  --limit-mm-per-prompt '{{"video":1,"image":1,"audio":1}}' \
  --media-io-kwargs '{{"video":{{"fps":2,"num_frames":256}}}}' \
  --allowed-local-media-path / \
  --trust-remote-code
recipe_version: '1'
name: Nemotron-3-Nano-Omni-NVFP4
cluster_only: false
solo_only: true