使用 llama.cpp 直接运行 Qwen3.5（GGUF 量化模型 + CUDA）

1、下载模型（需先安装 Hugging Face 命令行工具：pip install -U huggingface_hub）

hf download unsloth/Qwen3.5-35B-A3B-GGUF Qwen3.5-35B-A3B-Q4_K_M.gguf --local-dir /data/models/

2、安装编译依赖（非 root 用户请在下列命令前加 sudo）

apt update
apt install -y git build-essential cmake ninja-build libssl-dev

3、安装 CUDA Toolkit（包含 nvcc 编译器）

wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2604/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-13-3

4、克隆 llama.cpp 源码

git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp

5、编译（在 llama.cpp 源码目录下执行，启用 CUDA 后端）

cmake -B build \
  -DGGML_CUDA=ON \
  -DCMAKE_BUILD_TYPE=Release \
  -DCUDAToolkit_ROOT=/usr/local/cuda-13.3 \
  -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.3/bin/nvcc

cmake --build build -j

6、运行（在 llama.cpp 源码目录下启动 HTTP 服务；--host 0.0.0.0 会监听所有网卡，仅本机使用时请改为 127.0.0.1）

./build/bin/llama-server \
-m /data/models/Qwen3.5-35B-A3B-Q4_K_M.gguf \
-ngl 999 \
-c 16384 \
--host 0.0.0.0 \
--port 8000