diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 025b07b75..c159f8a8a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -109,7 +109,7 @@ jobs:
       matrix:
         # On PRs only build one representative arch (packaging logic is identical
         # across all sm_*); build the full matrix on schedule/dispatch.
-        sm: ${{ github.event_name == 'pull_request' && fromJSON('["sm_89"]') || fromJSON('["sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120"]') }}
+        sm: ${{ github.event_name == 'pull_request' && fromJSON('["sm_89"]') || fromJSON('["sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"]') }}
 
     steps:
       - name: Clone
@@ -138,7 +138,7 @@ jobs:
           wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
           sudo dpkg -i cuda-keyring_1.1-1_all.deb
           sudo apt-get update
-          sudo apt-get install -y cuda-toolkit-12-8 cmake ninja-build patchelf
+          sudo apt-get install -y cuda-toolkit-12-9 cmake ninja-build patchelf
 
       - name: Set CUDA environment
         run: |
@@ -210,7 +210,7 @@ jobs:
       matrix:
         # On PRs only build one representative arch (packaging logic is identical
         # across all sm_*); build the full matrix on schedule/dispatch.
-        sm: ${{ github.event_name == 'pull_request' && fromJSON('["sm_89"]') || fromJSON('["sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120"]') }}
+        sm: ${{ github.event_name == 'pull_request' && fromJSON('["sm_89"]') || fromJSON('["sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"]') }}
 
     steps:
       - name: Clone
@@ -223,9 +223,9 @@ jobs:
 
       - name: Install CUDA Toolkit
         id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.22
+        uses: Jimver/cuda-toolkit@v0.2.35
         with:
-          cuda: '12.8.0'
+          cuda: '12.9.1'
           method: 'network'
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "curand", "nvjitlink", "thrust", "visual_studio_integration"]'
 
@@ -291,6 +291,98 @@ jobs:
           name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-windows-cuda-${{ matrix.sm }}-x64.zip
           path: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-windows-cuda-${{ matrix.sm }}-x64.zip
 
+  ubuntu-arm64-cuda:
+    # Linux ARM64 CUDA build for the GB10 (sm_121).
+    runs-on: ubuntu-24.04-arm
+
+    strategy:
+      fail-fast: false  # only matters once the matrix has more than one entry
+      matrix:
+        # Single arch for now: sm_121, the GB10 (Blackwell) ARM64 device.
+        sm: ["sm_121"]
+
+    steps:
+      - name: Clone  # NOTE(review): repository + ref are pinned below, so this builds leejet/stable-diffusion.cpp@master regardless of the triggering ref — confirm intended
+        uses: actions/checkout@v6
+        with:
+          submodules: recursive
+          fetch-depth: 0  # full history rather than a shallow clone
+          repository: 'leejet/stable-diffusion.cpp'
+          ref: master
+
+      - name: ccache  # compiler cache; the key below is distinct per matrix.sm
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-arm64-cuda-${{ matrix.sm }}
+          evict-old-files: 1d  # presumably evicts cache entries older than one day — confirm against the action's docs
+
+      - name: Install CUDA Toolkit  # registers NVIDIA's ubuntu2404/sbsa apt repo via cuda-keyring, then installs the toolkit plus cmake, ninja-build and patchelf
+        run: |
+          wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-toolkit-12-9 cmake ninja-build patchelf
+
+      - name: Set CUDA environment  # exports CUDA_PATH, the nvcc bin dir on PATH, and LD_LIBRARY_PATH to all later steps
+        run: |
+          echo "CUDA_PATH=/usr/local/cuda" >> "$GITHUB_ENV"
+          echo "/usr/local/cuda/bin" >> "$GITHUB_PATH"
+          echo "LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"  # append ':old' only when set; a trailing ':' would add an empty entry (= cwd) to the search path
+
+      - name: Build  # configures and builds a Release CUDA build for matrix.sm; the "sm_" prefix is stripped before it is passed as CMAKE_CUDA_ARCHITECTURES
+        id: cmake_build
+        run: |
+          cmake_arch="${{ matrix.sm }}"
+          cmake_arch="${cmake_arch#sm_}"
+          cmake -B build -S . \
+            -DSD_CUBLAS=ON \
+            -DGGML_CUDA=ON \
+            -DCMAKE_CUDA_ARCHITECTURES="${cmake_arch}" \
+            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+            -DGGML_NATIVE=OFF \
+            -DSD_BUILD_SHARED_LIBS=ON \
+            -DCMAKE_BUILD_TYPE=Release
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Bundle CUDA runtime libraries  # copies the listed CUDA .so files, plus any libggml*.so* found outside build/bin, into build/bin
+        run: |
+          cuda_lib=/usr/local/cuda/lib64
+          cp -av ${cuda_lib}/libcudart.so*    build/bin/
+          cp -av ${cuda_lib}/libcublas.so*    build/bin/
+          cp -av ${cuda_lib}/libcublasLt.so*  build/bin/
+          cp -av ${cuda_lib}/libcurand.so*    build/bin/
+          cp -av ${cuda_lib}/libnvJitLink.so* build/bin/
+          find build -name 'libggml*.so*' ! -path 'build/bin/*' -exec cp -av {} build/bin/ \;
+
+      - name: Set RPATH for portable distribution  # sets RPATH to $ORIGIN on every regular (non-symlink) file in build/bin that `file` reports as ELF
+        run: |
+          for f in build/bin/*; do
+            [ -f "$f" ] && ! [ -L "$f" ] || continue
+            if file "$f" | grep -q 'ELF'; then
+              patchelf --set-rpath '$ORIGIN' "$f"
+            fi
+          done
+
+      - name: Get commit hash  # runs only on schedule or when create_release is 'true'; steps.commit.outputs.short feeds the artifact name
+        id: commit
+        if: ${{ github.event_name == 'schedule' || github.event.inputs.create_release == 'true' }}
+        uses: prompt/actions-commit-hash@v2
+
+      - name: Pack artifacts  # copies both license files into build/bin, then archives that directory as a .tar.xz
+        id: pack_artifacts
+        if: ${{ github.event_name == 'schedule' || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp ggml/LICENSE ./build/bin/ggml.txt
+          cp LICENSE ./build/bin/stable-diffusion.cpp.txt
+          tar -cJf sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-ubuntu-cuda-${{ matrix.sm }}-arm64.tar.xz -C ./build/bin .
+
+      - name: Upload artifacts  # same condition as packing; the artifact is named after the tarball it contains
+        if: ${{ github.event_name == 'schedule' || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-ubuntu-cuda-${{ matrix.sm }}-arm64.tar.xz
+          path: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-ubuntu-cuda-${{ matrix.sm }}-arm64.tar.xz
+
   windows-latest-cmake:
     runs-on: windows-2022
 
@@ -840,6 +932,7 @@ jobs:
       - ubuntu-latest-rocm
       - ubuntu-latest-cmake
       - ubuntu-latest-cuda
+      - ubuntu-arm64-cuda
       - windows-latest-cmake-hip
       - windows-latest-rocm
       - windows-latest-cmake