Intel/Qwen3.6-35B-A3B-int4-AutoRound
vllm — Intel AutoRound INT4 target + z-lab/Qwen3.6-35B-A3B-DFlash, k=8, long-context Spark Arena profile
Quick Info
MODEL
Intel/Qwen3.6-35B-A3B-int4-AutoRound
RUNTIME
vllm
TENSOR PARALLEL
1
NODES
1
AUTHOR
V
Valentin BONNEAU
Recipe YAML
# Spark Arena long-context recipe: Intel AutoRound INT4 Qwen3.6-35B-A3B served
# by vLLM, with DFlash speculative decoding (k = 8 draft tokens per step).
description: Intel AutoRound INT4 target + z-lab/Qwen3.6-35B-A3B-DFlash, k=8, long-context Spark Arena profile
model: Intel/Qwen3.6-35B-A3B-int4-AutoRound
container: vllm-node
mods:
  - mods/fix-qwen3.5-autoround
  - mods/fix-qwen3-coder-next
# Values substituted into the {placeholder} slots of the command template below.
defaults:
  port: 8000
  host: 0.0.0.0
  gpu_memory_utilization: 0.8
  max_model_len: 131072
  max_num_batched_tokens: 32768
  max_num_seqs: 4
  # Draft length for DFlash speculative decoding; consumed by the
  # {num_speculative_tokens} placeholder in the command (the "k=8" of this recipe).
  num_speculative_tokens: 8
env:
  # Quoted so the values stay strings rather than YAML integers.
  VLLM_MARLIN_USE_ATOMIC_ADD: '1'
  HF_HUB_OFFLINE: '1'
# Literal block scalar: newlines preserved; '{{' / '}}' are format-escaped
# literal braces for the inline JSON arguments.
command: |
  vllm serve Intel/Qwen3.6-35B-A3B-int4-AutoRound \
    --host {host} \
    --port {port} \
    --language-model-only \
    --max-model-len {max_model_len} \
    --max-num-batched-tokens {max_num_batched_tokens} \
    --max-num-seqs {max_num_seqs} \
    --gpu-memory-utilization {gpu_memory_utilization} \
    --load-format fastsafetensors \
    --attention-backend flash_attn \
    --enable-prefix-caching \
    --enable-chunked-prefill \
    --trust-remote-code \
    --quantization gptq \
    --reasoning-parser qwen3 \
    --enable-auto-tool-choice \
    --tool-call-parser qwen3_coder \
    --default-chat-template-kwargs '{{"preserve_thinking":true}}' \
    --speculative-config '{{"method":"dflash","model":"z-lab/Qwen3.6-35B-A3B-DFlash","num_speculative_tokens":{num_speculative_tokens}}}'
recipe_version: '1'
name: Qwen3.6-35B-A3B-AutoRound-DFlash-k8-solo-spark-arena-long
cluster_only: false
solo_only: true