```bash
vllm bench throughput --model baidu/ERNIE-4.5-21B-A3B-PT --dataset-name random \
    --input-len 8000 --output-len 1000 --num-prompts 16
```
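
For context, here is a rough Python equivalent of what this benchmark exercises, using vLLM's offline `LLM` API. The prompt construction and the throughput math below are simplifications for illustration, not what `vllm bench` does internally (the real random dataset samples token IDs directly).

```python
# Minimal sketch of a random-prompt throughput run via vLLM's offline API.
import time

from vllm import LLM, SamplingParams

INPUT_LEN = 8000    # matches --input-len
OUTPUT_LEN = 1000   # matches --output-len
NUM_PROMPTS = 16    # matches --num-prompts

llm = LLM(model="baidu/ERNIE-4.5-21B-A3B-PT")

# Build NUM_PROMPTS prompts of roughly INPUT_LEN tokens by repeating a word.
# (Assumption: "hi " tokenizes to about one token per repetition.)
prompts = ["hi " * INPUT_LEN for _ in range(NUM_PROMPTS)]

# ignore_eos forces full-length generations, like the benchmark does.
params = SamplingParams(max_tokens=OUTPUT_LEN, ignore_eos=True, temperature=1.0)

start = time.perf_counter()
outputs = llm.generate(prompts, params)
elapsed = time.perf_counter() - start

total_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"{total_tokens / elapsed:.1f} output tokens/s over {elapsed:.1f}s")
```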

```bash
vllm bench throughput \
    --model baidu/ERNIE-4.5-21B-A3B-PT \
    --dataset-name random \
    --input-len 9000 \
    --output-len 4000 \
    --num-prompts 5 \
    --max-model-len 13000 \
    --gpu-memory-utilization 0.7
```

```bash
vllm bench throughput \
    --model Qwen/Qwen3-30B-A3B-FP8 \
    --dataset-name random \
    --input-len 9000 \
    --output-len 6000 \
    --num-prompts 4 \
    --max-model-len 15000 \
    --gpu-memory-utilization 0.7
```

```bash
vllm bench throughput \
    --model Qwen/Qwen3-30B-A3B-FP8 \
    --dataset-name random \
    --input-len 9000 \
    --output-len 6000 \
    --num-prompts 10 \
    --max-model-len 15000 \
    --gpu-memory-utilization 0.7 \
    --swap-space 256
```

Nemotron-Nano-9B-v2 is a hybrid Mamba/attention model, which is why it needs `--trust-remote-code` and gets a separate dtype knob for its SSM state cache:

```bash
vllm bench throughput \
    --model nvidia/NVIDIA-Nemotron-Nano-9B-v2 \
    --dataset-name random \
    --input-len 9000 \
    --output-len 6000 \
    --num-prompts 2 \
    --max-model-len 15000 \
    --gpu-memory-utilization 0.7 \
    --trust-remote-code \
    --max-num-seqs 64 \
    --mamba-ssm-cache-dtype float16
```

```bash
# This one is a good one:
vllm bench throughput \
    --model Qwen/Qwen3-30B-A3B-FP8 \
    --dataset-name random \
    --input-len 12000 \
    --output-len 6000 \
    --num-prompts 12 \
    --max-model-len 20000 \
    --gpu-memory-utilization 0.7 \
    --kv-cache-dtype fp8 \
    --swap-space 128
```
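
Why `--kv-cache-dtype fp8` helps here: it halves the per-token KV-cache footprint versus fp16, so more long sequences fit in the GPU memory budget. A back-of-the-envelope sketch, with purely illustrative model dimensions (not Qwen3's real config):

```python
# Per-token KV-cache bytes = 2 (K and V) x layers x kv_heads x head_dim x bytes/elem.
def kv_bytes_per_token(num_layers, num_kv_heads, head_dim, bytes_per_elem):
    return 2 * num_layers * num_kv_heads * head_dim * bytes_per_elem

layers, kv_heads, head_dim = 48, 4, 128   # hypothetical GQA config
for name, nbytes in [("fp16", 2), ("fp8", 1)]:
    per_tok = kv_bytes_per_token(layers, kv_heads, head_dim, nbytes)
    # Peak for this run: 12 prompts x (12000 in + 6000 out) = 216000 cached tokens.
    total = per_tok * 12 * (12000 + 6000)
    print(f"{name}: {per_tok} B/token, ~{total / 2**30:.1f} GiB KV cache at peak")
```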

Be careful with `--swap-space`: too high a value can actually slow things down, because swapped KV blocks travel over PCIe and its bandwidth is limited.

```bash
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 vllm bench throughput \
    --model openai/gpt-oss-20b --dataset-name random \
    --input-len 12000 --output-len 6000 --num-prompts 8 \
    --max-model-len 20000 --gpu-memory-utilization 0.7 --kv-cache-dtype fp8
```
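
The attention backend can also be pinned from Python; a minimal sketch, assuming the environment variable is set before `vllm` is imported (it is read at engine startup):

```python
import os

# Must be set before importing vllm for the backend override to take effect.
os.environ["VLLM_ATTENTION_BACKEND"] = "TRITON_ATTN_VLLM_V1"

from vllm import LLM  # noqa: E402  (import after env setup is intentional)

llm = LLM(
    model="openai/gpt-oss-20b",
    max_model_len=20000,
    gpu_memory_utilization=0.7,
    kv_cache_dtype="fp8",
)
```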