commit dea3b0ec7f (parent 581fb0c0f0)
2025-09-13 05:49:07 +02:00
6 changed files with 294 additions and 12 deletions

examples/install.sh (new file, 26 lines)

@@ -0,0 +1,26 @@
cd /root
# install uv and create a virtual environment at /root/.venv
curl -LsSf https://astral.sh/uv/install.sh | sh
uv venv
apt update
apt install mc curl htop git -y
# toolchain and Python headers needed to build flash-attn / flashinfer
apt install -y build-essential python3.12-dev
source /root/.local/bin/env
export PATH=/root/.venv/bin/:$PATH
# export it so later vllm invocations actually pick up the backend setting
export VLLM_ATTENTION_BACKEND=FLASHINFER
uv pip install --upgrade pip setuptools wheel ninja
uv pip install --upgrade tiktoken ipython numpy psutil
# uv pip install --pre torch==2.9.0.dev20250804+cu128 --index-url https://download.pytorch.org/whl/nightly/cu128
uv pip install vllm --torch-backend=auto
uv pip install flash-attn --no-build-isolation
uv pip install triton
uv pip install xformers
# build FlashInfer from source; use uv pip so it lands in the same venv as vllm
git clone https://github.com/flashinfer-ai/flashinfer.git
cd flashinfer
uv pip install .
# uv pip install --upgrade vllm --torch-backend=auto
# uv pip install --upgrade flash-attn --no-build-isolation
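# A minimal post-install sanity check (a sketch, assuming the /root/.venv layout
# from the script above and that torch, vllm and flashinfer all landed in it):
source /root/.venv/bin/activate
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
python -c "import vllm; print(vllm.__version__)"
python -c "import flashinfer; print(flashinfer.__version__)"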

examples/readme.md (new file, 72 lines)

@@ -0,0 +1,72 @@
[vLLM recipe: Ernie 4.5](https://docs.vllm.ai/projects/recipes/en/latest/Ernie/Ernie4.5.html)
vllm bench throughput --model baidu/ERNIE-4.5-21B-A3B-PT --dataset-name random --input-len 8000 --output-len 1000 --num-prompts 16
vllm bench throughput \
--model baidu/ERNIE-4.5-21B-A3B-PT \
--dataset-name random \
--input-len 9000 \
--output-len 4000 \
--num-prompts 5 \
--max-model-len 13000 \
--gpu-memory-utilization 0.7
vllm bench throughput \
--model Qwen/Qwen3-30B-A3B-FP8 \
--dataset-name random \
--input-len 9000 \
--output-len 6000 \
--num-prompts 4 \
--max-model-len 15000 \
--gpu-memory-utilization 0.7
vllm bench throughput \
--model Qwen/Qwen3-30B-A3B-FP8 \
--dataset-name random \
--input-len 9000 \
--output-len 6000 \
--num-prompts 10 \
--max-model-len 15000 \
--gpu-memory-utilization 0.7 \
--swap-space 256
vllm bench throughput \
--model nvidia/NVIDIA-Nemotron-Nano-9B-v2 \
--dataset-name random \
--input-len 9000 \
--output-len 6000 \
--num-prompts 2 \
--max-model-len 15000 \
--gpu-memory-utilization 0.7 \
--trust-remote-code \
--max-num-seqs 64 \
--mamba-ssm-cache-dtype float16
# This one is a good configuration:
vllm bench throughput \
--model Qwen/Qwen3-30B-A3B-FP8 \
--dataset-name random \
--input-len 12000 \
--output-len 6000 \
--num-prompts 12 \
--max-model-len 20000 \
--gpu-memory-utilization 0.7 \
--kv-cache-dtype fp8 \
--swap-space 128
# Be careful with --swap-space: too high a value can actually slow things down, since swapped KV blocks move over PCIe and PCIe bandwidth is limited.
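# Before raising --swap-space, it can help to check what the host can actually
# sustain (a sketch, not part of the recipe above): the PCIe link the GPU
# negotiated and how much free host RAM is available to hold swapped KV blocks.
nvidia-smi --query-gpu=name,pcie.link.gen.current,pcie.link.width.current --format=csv
free -g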
# run gpt-oss with the Triton attention backend (prefix the env var so the benchmark process sees it)
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 \
vllm bench throughput --model openai/gpt-oss-20b --dataset-name random --input-len 12000 --output-len 6000 --num-prompts 8 --max-model-len 20000 --gpu-memory-utilization 0.7 --kv-cache-dtype fp8


@@ -1,3 +1,3 @@
-export SSH_SERVER=108.5.176.71
-export SSH_PORT=10200
+export SSH_SERVER=38.79.155.162
+export SSH_PORT=61092
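# These are presumably consumed by the connection helpers elsewhere in the repo;
# a manual session with the new values would look like (user name assumed):
ssh -p "$SSH_PORT" root@"$SSH_SERVER"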


@@ -4,11 +4,14 @@ set -euo pipefail
source ../../functions/base.sh
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
# vllm serve openai/gpt-oss-20b
# vllm serve openai/gpt-oss-20b --tensor-parallel-size 2
# vllm serve openai/gpt-oss-20b --tensor-parallel-size 8
# For 120B
# vllm serve openai/gpt-oss-120b
# vllm serve openai/gpt-oss-120b --tensor-parallel-size 8
mark_done
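# If one of the vllm serve lines above is uncommented, a quick smoke test against
# the OpenAI-compatible endpoint might look like this (default port 8000 assumed):
curl -s http://localhost:8000/v1/models
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "openai/gpt-oss-20b", "messages": [{"role": "user", "content": "Say hi"}], "max_tokens": 32}'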


@@ -13,8 +13,13 @@ exit 0
touch "$0.done"
exit 0
uv pip install --pre torch==2.9.0.dev20250804+cu128 \
--index-url https://download.pytorch.org/whl/nightly/cu128
cd /root
uv venv
source .venv/bin/activate
uv pip install --pre torch==2.9.0.dev20250804+cu129 \
--index-url https://download.pytorch.org/whl/nightly/cu129
uv pip install tiktoken ipython numpy psutil
@@ -22,7 +27,6 @@ uv pip install tiktoken ipython numpy psutil
python -c "import torch; print(torch.__version__, torch.version.cuda)"
# 2.9.0.dev20250804+cu128 12.8
cd /root
source .venv/bin/activate
uv pip install --upgrade pip setuptools wheel ninja
export MAX_JOBS=8
@@ -31,12 +35,13 @@ export NCCL_P2P_DISABLE=0
export NCCL_DEBUG=INFO
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flash-attn --no-build-isolation
uv pip install vllm --torch-backend=auto
uv pip install flash-attn --no-build-isolation
uv pip install --pre vllm==0.10.1+gptoss \
--extra-index-url https://wheels.vllm.ai/gpt-oss/ \
--extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
--index-strategy unsafe-best-match
# uv pip install --pre vllm==0.10.1+gptoss \
# --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
# --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
# --index-strategy unsafe-best-match
mark_done
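# A quick check (a sketch, assuming the /root/.venv used above) that the vllm build
# installed here is the one active in the venv and that flash-attn is importable:
python -c "import vllm, flash_attn; print(vllm.__version__, flash_attn.__version__)"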