...
This commit is contained in:
72
examples/readme.md
Normal file
72
examples/readme.md
Normal file
@@ -0,0 +1,72 @@
|
||||
[Ernie 4.5 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Ernie/Ernie4.5.html)
|
||||
|
||||
|
||||
vllm bench throughput --model baidu/ERNIE-4.5-21B-A3B-PT --dataset-name random --input-len 8000 --output-len 1000 --num-prompts 16
|
||||
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model baidu/ERNIE-4.5-21B-A3B-PT \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 4000 \
|
||||
--num-prompts 5 \
|
||||
--max-model-len 13000 \
|
||||
--gpu-memory-utilization 0.7
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen3-30B-A3B-FP8 \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 4 \
|
||||
--max-model-len 15000 \
|
||||
--gpu-memory-utilization 0.7
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen3-30B-A3B-FP8 \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 10 \
|
||||
--max-model-len 15000 \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--swap-space 256
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model nvidia/NVIDIA-Nemotron-Nano-9B-v2 \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 2 \
|
||||
--max-model-len 15000 \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--trust-remote-code \
|
||||
--max-num-seqs 64 \
|
||||
--mamba-ssm-cache-dtype float16
|
||||
|
||||
|
||||
|
||||
|
||||
# This configuration works well:
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen3-30B-A3B-FP8 \
|
||||
--dataset-name random \
|
||||
--input-len 12000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 12 \
|
||||
--max-model-len 20000 \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--kv-cache-dtype fp8 \
|
||||
--swap-space 128
|
||||
|
||||
# Be careful with swap space: setting it too high can actually slow things down, since PCIe bandwidth is limited.
|
||||
|
||||
|
||||
|
||||
|
||||
export VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1
|
||||
vllm bench throughput --model openai/gpt-oss-20b --dataset-name random --input-len 12000 --output-len 6000 --num-prompts 8 --max-model-len 20000 --gpu-memory-utilization 0.7 --kv-cache-dtype fp8
|
Reference in New Issue
Block a user