1、下载模型
# Fetch only the Q4_K_M GGUF file from the Hugging Face repo into /data/models/.
hf download unsloth/Qwen3.5-35B-A3B-GGUF \
  Qwen3.5-35B-A3B-Q4_K_M.gguf \
  --local-dir /data/models/
2、安装编译依赖
# Install the toolchain needed to build llama.cpp.
# - sudo: matches the CUDA install step, which also runs apt-get via sudo.
# - apt-get instead of apt: apt's command-line interface is not guaranteed
#   stable for non-interactive use.
# - wget: required by the CUDA keyring download step and absent from minimal
#   images.
sudo apt-get update
sudo apt-get install -y git build-essential cmake ninja-build libssl-dev wget
3、安装 CUDA Toolkit(含 nvcc)
# Register NVIDIA's CUDA apt repository via its keyring package, then install
# the CUDA 13.3 toolkit (which provides nvcc).
# NOTE(review): the repo path targets Ubuntu 26.04 on x86_64 — adjust it for
# other releases or architectures.
keyring_deb=cuda-keyring_1.1-1_all.deb
wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2604/x86_64/${keyring_deb}"
sudo dpkg -i "${keyring_deb}"
sudo apt-get update
sudo apt-get install -y cuda-toolkit-13-3
4、克隆 llama.cpp 源码
# Fetch the llama.cpp sources and enter the checkout: the later build and run
# commands use paths relative to the repository root (build/, ./build/bin/).
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
5、编译
# Configure a Release build with the CUDA backend enabled, pointing CMake
# explicitly at the CUDA 13.3 toolkit and its nvcc.
cmake -B build \
  -DGGML_CUDA=ON \
  -DCMAKE_BUILD_TYPE=Release \
  -DCUDAToolkit_ROOT=/usr/local/cuda-13.3 \
  -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.3/bin/nvcc
# Bound parallelism to the CPU count. With the default Makefile generator a
# bare -j is passed through to make as "unlimited jobs", which can exhaust
# memory while compiling the CUDA kernels.
cmake --build build -j "$(nproc)"
6、运行
# Start the llama.cpp HTTP server with the downloaded model:
#   --n-gpu-layers 999  request more layers than the model has, i.e. offload all
#   --ctx-size 16384    context window in tokens
#   --host/--port       listen on every interface, port 8000
# NOTE(review): 0.0.0.0 makes the server reachable from the network — confirm
# that is intended, or bind to 127.0.0.1.
./build/bin/llama-server \
  --model /data/models/Qwen3.5-35B-A3B-Q4_K_M.gguf \
  --n-gpu-layers 999 \
  --ctx-size 16384 \
  --host 0.0.0.0 \
  --port 8000