量化
git clone https://github.com/Tencent/AngelSlim.git
cd AngelSlim
git checkout remotes/origin/release/0.3
pip install -e .
python3 tools/run.py -c configs/qwen3/int4_gptq/qwen3-14b_int4_gptq.yaml
运行
pip install vllm
python -m vllm.entrypoints.openai.api_server \
--model ./output/qwen3-14b_int4_gptq \
--served-model-name qwen3-14b-int4-gptq \
--api-key your-secure-api-key \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.85 \
--max-model-len 8192 \
--port 8000
测试
安装依赖
pip install openai
代码
#vim example_local_qwen.py
from __future__ import annotations
import os
import sys

# Fail fast with an actionable install hint if the OpenAI SDK is missing.
try:
    from openai import OpenAI
except ImportError:
    print("请先安装: pip install openai", file=sys.stderr)
    raise SystemExit(1)

# Must match the vLLM launch flags; each value can be overridden via an
# environment variable.
BASE_URL = os.environ.get("OPENAI_BASE_URL", "http://127.0.0.1:8000/v1")  # vLLM --port
API_KEY = os.environ.get("OPENAI_API_KEY", "your-secure-api-key")  # vLLM --api-key
MODEL = os.environ.get("VLLM_MODEL", "qwen3-14b-int4-gptq")  # vLLM --served-model-name
def main() -> None:
    """Send one chat-completion request to the local vLLM server and print the reply."""
    client = OpenAI(base_url=BASE_URL, api_key=API_KEY)

    # Single-turn conversation: fixed system prompt plus one user question.
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "用一句话介绍你自己。"},
    ]

    response = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        max_tokens=256,
        temperature=0.7,
    )

    # content may be None for an empty completion; print "" in that case.
    reply = response.choices[0].message.content
    print(reply or "")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
运行
python example_local_qwen.py