[LLVMGPU] Explicitly set configs for vector distribution pipeline lowering tests (#18553)

This patch does two things:

- Before this patch, the tests in pipeline_vector_distribute.mlir ran
select-lowering-config together with the lowering pipeline, which
prevented testing the lowering of the same kernel with different
configs. This patch sets the configuration for the kernels in these
tests explicitly (a sketch of what that looks like follows this list).
Kernel-config logic for vector distribution is already covered by the
tests in config_vector_distribute.mlir.
- Tests for gfx940 and gfx1100 lived in the same file. Now that the
kernel configuration is set explicitly, running a test that uses a
gfx1100 intrinsic against the gfx940 test target would cause errors,
so this patch splits the tests into per-target files.
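
For illustration, "explicitly setting the configuration" means attaching a
lowering_config attribute to the tiled ops and a translation_info attribute to
the surrounding function instead of relying on config selection. Below is a
trimmed, standalone sketch using the attribute values from the new gfx1100 test
in this patch; the function name and signature are illustrative only, and the
real tests wrap this in the full hal.executable scaffolding shown further down:

#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 128]]>
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute
    workgroup_size = [64, 2, 1] subgroup_size = 32,
    {mma_schedule = #iree_gpu.mma_schedule<
        intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
        subgroup_m_count = 2, subgroup_n_count = 2>}>

// The function carries the translation info; each tiled op carries the
// lowering config (illustrative function, not part of the patch).
func.func @matmul_example(%lhs: tensor<256x256xf16>, %rhs: tensor<256x256xf16>,
                          %acc: tensor<256x256xf32>) -> tensor<256x256xf32>
    attributes {translation_info = #translation} {
  %0 = linalg.matmul {lowering_config = #config}
         ins(%lhs, %rhs : tensor<256x256xf16>, tensor<256x256xf16>)
         outs(%acc : tensor<256x256xf32>) -> tensor<256x256xf32>
  return %0 : tensor<256x256xf32>
}
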
Groverkss committed Sep 19, 2024
1 parent 04144f6 commit fa44a32
Showing 4 changed files with 174 additions and 210 deletions.
@@ -23,7 +23,8 @@ iree_lit_test_suite(
"config_user_vector_distribute.mlir",
"lowering_scalar_dispatch.mlir",
"pipeline_tile_and_fuse.mlir",
"pipeline_vector_distribute.mlir",
"pipeline_vector_distribute_gfx940.mlir",
"pipeline_vector_distribute_gfx1100.mlir",
"pipeline_warp_reduction.mlir",
],
include = ["*.mlir"],
@@ -19,7 +19,8 @@ iree_lit_test_suite(
"config_vector_distribute.mlir"
"lowering_scalar_dispatch.mlir"
"pipeline_tile_and_fuse.mlir"
"pipeline_vector_distribute.mlir"
"pipeline_vector_distribute_gfx1100.mlir"
"pipeline_vector_distribute_gfx940.mlir"
"pipeline_warp_reduction.mlir"
TOOLS
FileCheck
@@ -0,0 +1,96 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \
// RUN: --iree-codegen-llvmgpu-use-vector-distribution --iree-llvmgpu-enable-prefetch=true \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target)))))" \
// RUN: %s | FileCheck %s

#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 128]]>
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>, mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>}>

#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
hal.executable @matmul_256x256x256_f16_f32 {
hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export @matmul_256x256x256_f16_f32 layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_256x256x256_f16_f32() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf32>
%6 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%5 : tensor<256x256xf32>) -> tensor<256x256xf32>
%7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<256x256xf16>, tensor<256x256xf16>) outs(%6 : tensor<256x256xf32>) -> tensor<256x256xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
return
}
}
}
}

// CHECK-LABEL: func.func @matmul_256x256x256_f16_f32
// CHECK: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x8x1x1x1xf32>)
// Each subgroup handles 2 * 2 tiles, and for each tile we accumulate 8 times
// along the K dimension. So in total 32 wmma ops.
// CHECK-COUNT-32: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<8xf32>
// CHECK: scf.yield %{{.+}} : vector<2x2x8x1x1x1xf32>
// Since each subgroup handles 2 * 2 tiles, and for each tile each lane writes 8 values,
// we end up with 32 writes. We cannot do contiguous writes since the output columns have
// interleaved thread ids.
// CHECK-COUNT-32: vector.transfer_write {{.+}} {in_bounds = [true, true]} : vector<1x1xf32>, memref<256x256xf32, #hal.descriptor_type<storage_buffer>>

// -----

#config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 128]]>
#translation = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false>, mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>}>

#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
hal.executable @matmul_256x256x256_f16_f16 {
hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export @matmul_256x256x256_f16_f16 layout(#pipeline_layout) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @matmul_256x256x256_f16_f16() attributes {translation_info = #translation} {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf16>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
%5 = tensor.empty() : tensor<256x256xf16>
%6 = linalg.fill {lowering_config = #config} ins(%cst : f16) outs(%5 : tensor<256x256xf16>) -> tensor<256x256xf16>
%7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<256x256xf16>, tensor<256x256xf16>) outs(%6 : tensor<256x256xf16>) -> tensor<256x256xf16>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf16> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf16>>
return
}
}
}
}

// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16
// CHECK: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args({{.*}}) -> (vector<2x2x16x1x1x1xf16>)
// Each subgroup handles 2 * 2 tiles, and for each tile we accumulate 8 times
// along the K dimension. So in total 32 wmma ops.
// CHECK-COUNT-32: amdgpu.wmma {{.*}} : vector<16xf16>, vector<16xf16>, vector<16xf16>
// CHECK: scf.yield %{{.+}} : vector<2x2x16x1x1x1xf16>
// Since each subgroup handles 2 * 2 tiles, and for each tile each lane writes 8 values,
// we end up with 32 writes. We cannot do contiguous writes since the output columns have
// interleaved thread ids.
// CHECK-COUNT-32: vector.transfer_write {{.+}} {in_bounds = [true, true]} : vector<1x1xf16>, memref<256x256xf16, #hal.descriptor_type<storage_buffer>>
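
For reference, the CHECK-COUNT-32 on the amdgpu.wmma ops in both tests can be
recomputed from the #config and #translation attributes set above (a
back-of-the-envelope check under those tile sizes and mma_schedule, not an
assertion made by the patch itself):

    intrinsic tiles per subgroup = (64 / subgroup_m_count / 16) * (64 / subgroup_n_count / 16) = 2 * 2 = 4
    K steps per loop iteration   = 128 / 16 = 8
    wmma ops per loop iteration  = 4 * 8 = 32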