// https://github.com/lawmurray/gpu-gemm/blob/main/src/gemm.cu

/**
 * Map a serial index to two-dimensional coordinates along a Hilbert-style
 * space-filling curve.
 *
 * The index is consumed two bits at a time, least-significant pair first.
 * Each pair selects one of four quadrants at the current scale `k`; the
 * coordinates accumulated so far are flipped and/or transposed so that
 * consecutive quadrants join up, then offset into the selected quadrant.
 *
 * @tparam M Number of rows.
 * @tparam N Number of columns. Must satisfy `M == N` or `M == N/2`.
 *
 * @param s Serial index along the curve.
 *          NOTE(review): presumably expected to be non-negative and less than
 *          `M*N`, with `M` and `N` powers of two -- confirm against callers.
 *
 * @return Coordinates `{i, j}` of the point at index `s`.
 */
template<int M, int N>
requires (M == N || M == N/2)
__device__ point2 hilbert2(const int s) {
  // Loop bound is a compile-time constant; equivalent to max(M, N).
  constexpr int extent = (M > N) ? M : N;

  int i = 0, j = 0;
  int t = s;
  for (int k = 1; k < extent; k *= 2) {
    // Decode the low two bits of t into a quadrant (bi, bj).
    int bi = 1 & (t/2);  // local gray code, u shape top left to bottom left
    int bj = 1 & (t ^ bi);
    if (bj == 0) {
      if (bi == 1) {
        i = k - 1 - i;  // flip up-down
        j = k - 1 - j;  // flip left-right
      }
      int tmp = i;  // transpose
      i = j;
      j = tmp;
    }
    // Shift into the selected quadrant at this scale.
    i += k*bi;
    j += k*bj;
    t /= 4;  // advance to the next pair of bits
  }
  return {i, j};
}