...
This commit is contained in:
72
examples/readme.md
Normal file
72
examples/readme.md
Normal file
@@ -0,0 +1,72 @@
|
||||
[Ernie 4.5 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Ernie/Ernie4.5.html)
|
||||
|
||||
|
||||
vllm bench throughput --model baidu/ERNIE-4.5-21B-A3B-PT --dataset-name random --input-len 8000 --output-len 1000 --num-prompts 16
|
||||
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model baidu/ERNIE-4.5-21B-A3B-PT \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 4000 \
|
||||
--num-prompts 5 \
|
||||
--max-model-len 13000 \
|
||||
--gpu-memory-utilization 0.7
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen3-30B-A3B-FP8 \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 4 \
|
||||
--max-model-len 15000 \
|
||||
--gpu-memory-utilization 0.7
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen3-30B-A3B-FP8 \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 10 \
|
||||
--max-model-len 15000 \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--swap-space 256
|
||||
|
||||
|
||||
vllm bench throughput \
|
||||
--model nvidia/NVIDIA-Nemotron-Nano-9B-v2 \
|
||||
--dataset-name random \
|
||||
--input-len 9000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 2 \
|
||||
--max-model-len 15000 \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--trust-remote-code \
|
||||
--max-num-seqs 64 \
|
||||
--mamba-ssm-cache-dtype float16
|
||||
|
||||
|
||||
|
||||
|
||||
# This configuration works well:
|
||||
vllm bench throughput \
|
||||
--model Qwen/Qwen3-30B-A3B-FP8 \
|
||||
--dataset-name random \
|
||||
--input-len 12000 \
|
||||
--output-len 6000 \
|
||||
--num-prompts 12 \
|
||||
--max-model-len 20000 \
|
||||
--gpu-memory-utilization 0.7 \
|
||||
--kv-cache-dtype fp8 \
|
||||
--swap-space 128
|
||||
|
||||
# Be careful with swap space: setting it too high can actually slow things down, since PCIe bandwidth is limited.
|
||||
|
||||
|
||||
|
||||
|
||||
export VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1
|
||||
vllm bench throughput --model openai/gpt-oss-20b --dataset-name random --input-len 12000 --output-len 6000 --num-prompts 8 --max-model-len 20000 --gpu-memory-utilization 0.7 --kv-cache-dtype fp8
|
Reference in New Issue
Block a user