#!/usr/bin/env bash
# Prepare an Arch Linux machine for building llama.cpp with CUDA:
# check the NVIDIA GPU/driver, install the toolchain with pacman, and
# fetch the llama.cpp sources.
set -euo pipefail

# Stop early when no NVIDIA device is listed or the driver does not respond.
lspci | grep -i nvidia || {
  printf 'error: no NVIDIA device reported by lspci\n' >&2
  exit 1
}
nvidia-smi

# Bring the system fully up to date before installing new packages, then
# install the CUDA toolkit and build tools (--needed skips packages that are
# already installed and current).
sudo pacman -Syu
sudo pacman -S --needed cuda base-devel cmake git

# Sanity-check that the toolchain is on PATH.
gcc --version
cmake --version
git --version

# Clone only once, and only after git has been installed: a second
# `git clone` into an existing llama.cpp directory fails.
if [[ ! -d llama.cpp/.git ]]; then
  git clone https://github.com/ggml-org/llama.cpp
fi
# Configure and build llama.cpp with the CUDA backend (Release build).

# Show which login shell is in use.
printf '%s\n' "$SHELL"

# Everything below must run inside the checkout; stop if it is missing.
cd llama.cpp/ || {
  printf 'error: cannot enter llama.cpp/ (was the repository cloned?)\n' >&2
  exit 1
}

# Refresh the checkout before configuring so the build uses current sources.
git pull

# Each -D option must be its own argument: a backslash followed by a space is
# an escaped space, not a line continuation.
cmake -B build \
  -DGGML_CUDA=ON \
  -DGGML_CUDA_FA_ALL_QUANTS=ON \
  -DCMAKE_BUILD_TYPE=Release

# Build in parallel using every available CPU core.
cmake --build build --config Release -j"$(nproc)"
# Start llama-server (run from inside the llama.cpp checkout) with the
# Qwen3.6-35B-A3B Q4_K_M model and its multimodal projector.
# Flag descriptions below follow `llama-server --help`; re-check them
# against the version you built.
model_dir="${HOME}/.lmstudio/models/lmstudio-community/Qwen3.6-35B-A3B-GGUF"

server_args=(
  -m "${model_dir}/Qwen3.6-35B-A3B-Q4_K_M.gguf"
  --mmproj "${model_dir}/mmproj-Qwen3.6-35B-A3B-BF16.gguf"

  # GPU / memory placement.
  -ngl 999                # offload as many layers as possible to the GPU
  -ncmoe 21               # keep the MoE expert weights of the first 21 layers on the CPU
  --no-mmap               # load the model into memory instead of memory-mapping it
  --mlock                 # ask the OS to keep the model resident in RAM
  --flash-attn auto

  # Network. NOTE(review): 0.0.0.0 listens on every interface; use 127.0.0.1
  # if the server should not be reachable from other machines.
  --host 0.0.0.0
  --port 8080

  # Context window and KV-cache quantization.
  -c 131072
  --cache-type-k q8_0
  --cache-type-v q8_0

  # Chat template handling and per-image token limits.
  # The original command also passed `--image-max-tokens 560` before these
  # two flags; it was superseded by the later 1120 value (assuming the last
  # occurrence wins), so only the effective limits are kept.
  --jinja
  --image-min-tokens 1120
  --image-max-tokens 1120

  # Default sampling parameters.
  --temp 0.7
  --repeat-penalty 1.05
  --min-p 0.05
)

./build/bin/llama-server "${server_args[@]}"
# Note: `-ncmoe 21` seems OK with 32 GB (presumably system RAM) and a 16 GB GPU.