#!/usr/bin/env bash
# Throughput benchmark runs for several models on vLLM (`vllm bench throughput`).
# Recipe reference:
#   https://docs.vllm.ai/projects/recipes/en/latest/Ernie/Ernie4.5.html
# Each invocation below is a standalone experiment; run the whole file or
# copy/paste individual commands.

vllm bench throughput \
  --model baidu/ERNIE-4.5-21B-A3B-PT \
  --dataset-name random \
  --input-len 8000 \
  --output-len 1000 \
  --num-prompts 16

vllm bench throughput \
  --model baidu/ERNIE-4.5-21B-A3B-PT \
  --dataset-name random \
  --input-len 9000 \
  --output-len 4000 \
  --num-prompts 5 \
  --max-model-len 13000 \
  --gpu-memory-utilization 0.7

vllm bench throughput \
  --model Qwen/Qwen3-30B-A3B-FP8 \
  --dataset-name random \
  --input-len 9000 \
  --output-len 6000 \
  --num-prompts 4 \
  --max-model-len 15000 \
  --gpu-memory-utilization 0.7

vllm bench throughput \
  --model Qwen/Qwen3-30B-A3B-FP8 \
  --dataset-name random \
  --input-len 9000 \
  --output-len 6000 \
  --num-prompts 10 \
  --max-model-len 15000 \
  --gpu-memory-utilization 0.7 \
  --swap-space 256

vllm bench throughput \
  --model nvidia/NVIDIA-Nemotron-Nano-9B-v2 \
  --dataset-name random \
  --input-len 9000 \
  --output-len 6000 \
  --num-prompts 2 \
  --max-model-len 15000 \
  --gpu-memory-utilization 0.7 \
  --trust-remote-code \
  --max-num-seqs 64 \
  --mamba-ssm-cache-dtype float16

# IS A GOOD ONE:
# NOTE: be careful with --swap-space — setting it too high can make things
# slower, since PCIe bandwidth is limited.
vllm bench throughput \
  --model Qwen/Qwen3-30B-A3B-FP8 \
  --dataset-name random \
  --input-len 12000 \
  --output-len 6000 \
  --num-prompts 12 \
  --max-model-len 20000 \
  --gpu-memory-utilization 0.7 \
  --kv-cache-dtype fp8 \
  --swap-space 128

# Force the Triton attention backend for this run via the environment.
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 vllm bench throughput \
  --model openai/gpt-oss-20b \
  --dataset-name random \
  --input-len 12000 \
  --output-len 6000 \
  --num-prompts 8 \
  --max-model-len 20000 \
  --gpu-memory-utilization 0.7 \
  --kv-cache-dtype fp8