From 6e8fcb0f35a60893c0e3700c57c2fc028b76c91a Mon Sep 17 00:00:00 2001
From: Jianhui Dai
Date: Tue, 14 Jan 2025 13:50:12 +0800
Subject: [PATCH] [webgpu] Restore MatMulNBits workgroup size for Phi-3.5

This change restores the MatMulNBits workgroup size from (8, 8, 1) back to
(16, 8, 1) to resolve a performance regression observed on Intel iGPUs during
token generation (M=1).

Signed-off-by: Jianhui Dai
---
 onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
index 28e4ccec09b32..018a7e5ac2675 100644
--- a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -583,7 +583,8 @@ Status MatMulNBits::ComputeInternal(onnxruntime::webgpu::ComputeContext& context
     program.CacheHint("T_M" + std::to_string(tile_m) + "Subgroup" + std::to_string(use_subgroup));
   } else if (block_size == 32) {
     components = 1;
-    constexpr uint32_t workgroup_size = 64;
+    // TODO: Tune the workgroup size when `M=1`.
+    constexpr uint32_t workgroup_size = 128;
     const uint32_t workgroup_y = N % 8 == 0 ? 8 : 1;
     const uint32_t workgroup_x = workgroup_size / workgroup_y;
     program.SetWorkgroupSize(workgroup_x, workgroup_y, 1);