diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir index ae38ed6c8f957..71ae9826ad84c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir @@ -1,11 +1,17 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-cpu-materialize-device-encoding),canonicalize,cse)" --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @set_encoding_with_padding_semantics_bf16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> }{ %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1000], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1000xbf16> %3 = iree_encoding.set_encoding %2 : tensor<1x1000xbf16> -> tensor<1x1000xbf16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1, 1000], strides = [1, 1] : tensor<1x1000xbf16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> -> !flow.dispatch.tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>> @@ -30,6 +36,12 @@ func.func @set_encoding_with_padding_semantics_bf16_x86_64_avx512f() attributes // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -37,8 +49,8 @@ func.func @set_encoding_7x7x7_matmul_LHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { %c0 = arith.constant 0 : index - %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - 
%11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [7, 7], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<7x7xf32> %17 = iree_encoding.set_encoding %14 : tensor<7x7xf32> -> tensor<7x7xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> flow.dispatch.tensor.store %17, %11, offsets = [0, 0], sizes = [7, 7], strides = [1, 1] : tensor<7x7xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> @@ -55,6 +67,12 @@ func.func @set_encoding_7x7x7_matmul_LHS() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -62,8 +80,8 @@ func.func @set_encoding_128x80x32_batch_matmul_LHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { %c0 = arith.constant 0 : index - %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32xf32> %17 = iree_encoding.set_encoding %14 : tensor<128x80x32xf32> -> tensor<128x80x32xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> flow.dispatch.tensor.store %17, %11, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] @@ -72,8 +90,8 @@ func.func @set_encoding_128x80x32_batch_matmul_LHS() attributes { return } // CHECK-LABEL: func @set_encoding_128x80x32_batch_matmul_LHS( -// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.*}} !flow.dispatch.tensor> -// CHECK: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.*}} !flow.dispatch.tensor> +// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.*}} !flow.dispatch.tensor> +// CHECK: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.*}} !flow.dispatch.tensor> // CHECK: %[[INPUT:.+]] = flow.dispatch.tensor.load %[[INPUT_BINDING]], offsets = 
[0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32xf32> // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<128x10x32x8x1xf32> // CHECK: %[[PACK:.+]] = tensor.pack %[[INPUT]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<128x80x32xf32> -> tensor<128x10x32x8x1xf32> @@ -81,6 +99,12 @@ func.func @set_encoding_128x80x32_batch_matmul_LHS() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -88,10 +112,10 @@ func.func @set_encoding_128x32x320_batch_matmul_RHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %5 = arith.index_castui %0 {stream.alignment = 64 : index} : i32 to index - %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x320xf32> %19 = iree_encoding.set_encoding %16 : tensor<128x32x320xf32> -> tensor<128x32x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> flow.dispatch.tensor.store %19, %13, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] @@ -100,8 +124,8 @@ func.func @set_encoding_128x32x320_batch_matmul_RHS() attributes { return } // CHECK-LABEL: func @set_encoding_128x32x320_batch_matmul_RHS( -// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.*}} !flow.dispatch.tensor> -// CHECK: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.*}} !flow.dispatch.tensor> +// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.*}} !flow.dispatch.tensor> +// CHECK: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.*}} !flow.dispatch.tensor> // CHECK: %[[INPUT:.+]] = flow.dispatch.tensor.load %[[INPUT_BINDING]], offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x320xf32> // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<128x40x32x8x1xf32> // CHECK: %[[PACK:.+]] = tensor.pack %[[INPUT]] outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [8, 1] into %[[EMPTY]] : tensor<128x32x320xf32> -> tensor<128x40x32x8x1xf32> @@ -109,6 +133,12 @@ func.func @set_encoding_128x32x320_batch_matmul_RHS() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, 
storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -116,10 +146,10 @@ func.func @unset_encoding_128x80x320_batch_matmul_RESULT() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %3 = arith.index_castui %0 : i32 to index - %6 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> %10 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> -> tensor<128x80x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> @@ -129,11 +159,11 @@ func.func @unset_encoding_128x80x320_batch_matmul_RESULT() attributes { } // CHECK-LABEL: func @unset_encoding_128x80x320_batch_matmul_RESULT() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] +// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) // CHECK: %[[CAST:.+]] = arith.index_castui %[[D0]] : i32 to index -// CHECK: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) +// CHECK: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[C0]]) // CHECK-SAME: : !flow.dispatch.tensor> -// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[CAST]]) +// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[CAST]]) // CHECK-SAME: : !flow.dispatch.tensor> // CHECK: %[[INPUT:.+]] = flow.dispatch.tensor.load %[[INPUT_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 40, 8, 8], strides = [1, 1, 1, 1, 1] @@ -214,6 +244,13 @@ func.func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_vi // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -221,14 +258,14 @@ func.func @matmul_lowering_f32f32f32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = 
hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -252,16 +289,16 @@ func.func @matmul_lowering_f32f32f32_aarch64() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-LABEL: func @matmul_lowering_f32f32f32_aarch64() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 8, 1], strides = [1, 1, 1, 1] @@ -280,20 +317,16 @@ func.func @matmul_lowering_f32f32f32_aarch64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -func.func @matvec_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes { +func.func @matvec_lowering_f32f32f32_aarch64(%arg0: tensor<16x16xf32>, %arg1: tensor<16xf32>, %arg2: tensor<16xf32>) -> tensor<16xf32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32> - %1 = 
hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16xf32> - %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16xf32> - %3 = iree_encoding.set_encoding %0 : tensor<16x16xf32> -> tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> - %4 = iree_encoding.set_encoding %1 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> - %5 = iree_encoding.set_encoding %2 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> + %3 = iree_encoding.set_encoding %arg0 : tensor<16x16xf32> -> tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> + %4 = iree_encoding.set_encoding %arg1 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> + %5 = iree_encoding.set_encoding %arg2 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> %6 = linalg.matvec ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>, tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>) outs(%5 : tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>) -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> %7 = iree_encoding.unset_encoding %6 : tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> -> tensor<16xf32> - %8 = hal.tensor.export %7 "output0" : tensor<16xf32> -> !hal.buffer_view - func.return %8 : !hal.buffer_view + func.return %7 : tensor<16xf32> } // CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64( // CHECK: %[[MMT4D:.+]] = linalg.mmt4d @@ -302,6 +335,13 @@ func.func @matvec_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !ha // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) 
-> (d0, d1)> @@ -309,11 +349,11 @@ func.func @matvec_lowering_f32f32f32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16, 16], strides = [1, 1] : !flow.dispatch.tensor>>> @@ -336,11 +376,11 @@ func.func @matvec_lowering_f32f32f32_aarch64() attributes { } // CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor> -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor> -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor> // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 1], strides = [1, 1, 1, 1] @@ -356,6 +396,13 @@ func.func @matvec_lowering_f32f32f32_aarch64() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -363,14 +410,14 @@ func.func @matmul_lowering_f16f16f16_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) 
binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -394,16 +441,16 @@ func.func @matmul_lowering_f16f16f16_aarch64() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-LABEL: func @matmul_lowering_f16f16f16_aarch64() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 8, 1], strides = [1, 1, 1, 1] @@ -419,6 +466,13 @@ func.func @matmul_lowering_f16f16f16_aarch64() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -426,14 +480,14 @@ func.func @matmul_lowering_f32f32f32_x86_64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) 
type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -458,16 +512,16 @@ func.func @matmul_lowering_f32f32f32_x86_64() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> // CHECK-LABEL: func @matmul_lowering_f32f32f32_x86_64() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP1]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 8, 1], strides = [1, 1, 1, 1] @@ -483,6 +537,13 @@ func.func @matmul_lowering_f32f32f32_x86_64() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -490,14 +551,14 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx2() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) 
type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -521,16 +582,16 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx2() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-LABEL: func @matmul_lowering_f32f32f32_x86_64_avx2() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 8, 1], strides = [1, 1, 1, 1] @@ -546,6 +607,13 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx2() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -553,14 +621,14 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) 
type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -584,16 +652,16 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx512f() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-LABEL: func @matmul_lowering_f32f32f32_x86_64_avx512f() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 16, 1], strides = [1, 1, 1, 1] @@ -609,6 +677,13 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx512f() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -616,14 +691,14 @@ func.func @matmul_lowering_f16f16f32_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) 
binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -647,16 +722,16 @@ func.func @matmul_lowering_f16f16f32_x86_64_avx512f() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-LABEL: func @matmul_lowering_f16f16f32_x86_64_avx512f() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 16, 1], strides = [1, 1, 1, 1] @@ -672,6 +747,13 @@ func.func @matmul_lowering_f16f16f32_x86_64_avx512f() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -679,14 +761,14 @@ func.func @matmul_lowering_f16f16f16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan 
set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -710,16 +792,16 @@ func.func @matmul_lowering_f16f16f16_x86_64_avx512f() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-LABEL: func @matmul_lowering_f16f16f16_x86_64_avx512f() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 16, 1], strides = [1, 1, 1, 1] @@ -735,6 +817,13 @@ func.func @matmul_lowering_f16f16f16_x86_64_avx512f() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -742,14 +831,14 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = 
hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -773,16 +862,16 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512f() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-LABEL: func @matmul_lowering_bf16bf16f32_x86_64_avx512f() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 16, 1], strides = [1, 1, 1, 1] @@ -798,6 +887,13 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512f() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -805,14 +901,14 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : 
!flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -836,16 +932,16 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-LABEL: func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[K]], 16, 1], strides = [1, 1, 1, 1] @@ -861,6 +957,13 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -868,14 +971,14 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f,+avx512bf16"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) 
binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -900,17 +1003,17 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> // CHECK-LABEL: func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 16, 2], strides = [1, 1, 1, 1] @@ -926,6 +1029,13 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -933,14 +1043,14 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f,+avx512bf16"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) 
binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -965,17 +1075,17 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> // CHECK-LABEL: func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 16, 2], strides = [1, 1, 1, 1] @@ -991,6 +1101,13 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -998,14 +1115,14 @@ func.func @matmul_lowering_f32f16f16_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) 
alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %lhs_f32 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1038,14 +1155,14 @@ func.func @matmul_lowering_f32f16f16_aarch64() attributes { // CHECK-DAG: #[[$MAP_CEILDIV_8:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-DAG: #[[$MAP_IDENTITY_4D:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> // CHECK-LABEL: func.func @matmul_lowering_f32f16f16_aarch64() -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] : index -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] : index -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] : index +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index // CHECK-DAG: %[[M_CEILDIV_8:.+]] = affine.apply #[[$MAP_CEILDIV_8]]()[%[[M]]] // CHECK-DAG: %[[N_CEILDIV_8:.+]] = affine.apply #[[$MAP_CEILDIV_8]]()[%[[N]]] -// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_8]], %[[K]]} -// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.*}} : !flow.dispatch.tensor>{%[[N_CEILDIV_8]], %[[K]]} -// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_8]], %[[N_CEILDIV_8]]} +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_8]], %[[K]]} +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.*}} : !flow.dispatch.tensor>{%[[N_CEILDIV_8]], %[[K]]} +// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_8]], %[[N_CEILDIV_8]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[M_CEILDIV_8]], %[[K]], 8, 1], {{.*}} -> tensor // CHECK: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[N_CEILDIV_8]], %[[K]], 8, 1], {{.*}} -> tensor // CHECK: %[[OUT:.+]] = flow.dispatch.tensor.load %[[OUT_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[M_CEILDIV_8]], %[[N_CEILDIV_8]], 8, 8], {{.*}} -> tensor @@ -1056,6 +1173,13 @@ func.func @matmul_lowering_f32f16f16_aarch64() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1063,14 +1187,14 @@ func.func @matmul_lowering_f32f16f16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", 
{target_triple="x86_64-xyz-xyz", cpu_features="+avx512f,+avx512bf16"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %lhs_f32 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1104,14 +1228,14 @@ func.func @matmul_lowering_f32f16f16_x86_64_avx512f() attributes { // CHECK-DAG: #[[$MAP_CEILDIV_16:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-DAG: #[[$MAP_IDENTITY_4D:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> // CHECK-LABEL: func.func @matmul_lowering_f32f16f16_x86_64_avx512f() -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] : index -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] : index -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] : index +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index // CHECK-DAG: %[[M_CEILDIV_16:.+]] = affine.apply #[[$MAP_CEILDIV_16]]()[%[[M]]] // CHECK-DAG: %[[N_CEILDIV_16:.+]] = affine.apply #[[$MAP_CEILDIV_16]]()[%[[N]]] -// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_16]], %[[K]]} -// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.*}} : !flow.dispatch.tensor>{%[[N_CEILDIV_16]], %[[K]]} -// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_16]], %[[N_CEILDIV_16]]} +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_16]], %[[K]]} +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.*}} : !flow.dispatch.tensor>{%[[N_CEILDIV_16]], %[[K]]} +// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) {{.*}} : !flow.dispatch.tensor>{%[[M_CEILDIV_16]], %[[N_CEILDIV_16]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[M_CEILDIV_16]], %[[K]], 16, 1], {{.*}} -> tensor // CHECK: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[N_CEILDIV_16]], %[[K]], 16, 1], {{.*}} -> tensor // CHECK: %[[OUT:.+]] 
= flow.dispatch.tensor.load %[[OUT_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[M_CEILDIV_16]], %[[N_CEILDIV_16]], 16, 16], {{.*}} -> tensor @@ -1122,6 +1246,13 @@ func.func @matmul_lowering_f32f16f16_x86_64_avx512f() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1129,14 +1260,14 @@ func.func @matmul_lowering_i8i8i32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1159,14 +1290,14 @@ func.func @matmul_lowering_i8i8i32_aarch64() attributes { } // CHECK-LABEL: func @matmul_lowering_i8i8i32_aarch64() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[M]], %[[K]]} -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[K]], %[[N]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[M]], %[[N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0], sizes = [%[[M]], %[[K]]], strides = [1, 1] @@ -1182,6 +1313,13 @@ func.func @matmul_lowering_i8i8i32_aarch64() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + 
#hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1189,14 +1327,14 @@ func.func @matmul_lowering_i8i8i32_aarch64_dotprod() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1221,17 +1359,17 @@ func.func @matmul_lowering_i8i8i32_aarch64_dotprod() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> // CHECK-LABEL: func @matmul_lowering_i8i8i32_aarch64_dotprod() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 8, 4], strides = [1, 1, 1, 1] @@ -1247,6 +1385,13 @@ func.func 
@matmul_lowering_i8i8i32_aarch64_dotprod() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1254,14 +1399,14 @@ func.func @matmul_lowering_i8i8i32_aarch64_i8mm() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod,+i8mm", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1285,17 +1430,17 @@ func.func @matmul_lowering_i8i8i32_aarch64_i8mm() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-LABEL: func @matmul_lowering_i8i8i32_aarch64_i8mm() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP0]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 
0], sizes = [%[[TILED_M]], %[[TILED_K]], 8, 8], strides = [1, 1, 1, 1] @@ -1311,6 +1456,13 @@ func.func @matmul_lowering_i8i8i32_aarch64_i8mm() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1318,14 +1470,14 @@ func.func @matmul_lowering_i8i4i32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1351,17 +1503,17 @@ func.func @matmul_lowering_i8i4i32_aarch64() attributes { // CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> // CHECK-LABEL: func @matmul_lowering_i8i4i32_aarch64() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP2]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] 
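// Written out in full, the #pipeline_layout attribute defined at the head of
// each three-binding matmul test takes the shape below. The push_constants = 3
// value is an assumption chosen to match the three
// hal.interface.constant.load ordinals (M, N, K); the binding list follows the
// entries visible in each hunk:
//
//   #pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
//     #hal.descriptor_set.layout<0, bindings = [
//       #hal.descriptor_set.binding<0, storage_buffer>,
//       #hal.descriptor_set.binding<1, storage_buffer>,
//       #hal.descriptor_set.binding<2, storage_buffer>
//     ]>
//   ]>
//
// The two-binding set_encoding/unset_encoding tests later in the file use the
// same shape with one fewer binding.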
// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 4, 2], strides = [1, 1, 1, 1] @@ -1377,6 +1529,13 @@ func.func @matmul_lowering_i8i4i32_aarch64() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1384,14 +1543,14 @@ func.func @matmul_lowering_i8i4i32_aarch64_dotprod() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1415,17 +1574,17 @@ func.func @matmul_lowering_i8i4i32_aarch64_dotprod() attributes { // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-LABEL: func @matmul_lowering_i8i4i32_aarch64_dotprod() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP0]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: 
!flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 8, 8], strides = [1, 1, 1, 1] @@ -1441,6 +1600,13 @@ func.func @matmul_lowering_i8i4i32_aarch64_dotprod() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1448,14 +1614,14 @@ func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod,+i8mm", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1481,17 +1647,17 @@ func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { // CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> // CHECK-LABEL: func @matmul_lowering_i8i4i32_aarch64_i8mm() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP2]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// 
CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 4, 16], strides = [1, 1, 1, 1] @@ -1510,7 +1676,7 @@ func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -func.func @matmul_lowering_f32f32f32_aarch64_sve(%lhs : tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { +func.func @matmul_lowering_f32f32f32_aarch64_sve(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {cpu_features = "+sve", target_triple="aarch64-xyz-xyz"}> } { %0 = iree_encoding.set_encoding %lhs : tensor -> tensor>> @@ -1535,7 +1701,7 @@ func.func @matmul_lowering_f32f32f32_aarch64_sve(%lhs : tensor, %rhs: t #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -func.func @matmul_lowering_f32f32f32_riscv(%lhs : tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { +func.func @matmul_lowering_f32f32f32_riscv(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="riscv32-xyz-xyz"}> } { %0 = iree_encoding.set_encoding %lhs : tensor -> tensor>> @@ -1556,6 +1722,13 @@ func.func @matmul_lowering_f32f32f32_riscv(%lhs : tensor, %rhs: tensor< // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1563,14 +1736,14 @@ func.func @matmul_lowering_i8i8i32_riscv32_ukernel() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="riscv32-xyz-xyz", ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1595,17 +1768,17 @@ func.func @matmul_lowering_i8i8i32_riscv32_ukernel() attributes { // CHECK-DAG: #[[$MAP1:.+]] = 
affine_map<()[s0] -> (s0 ceildiv 4)> // CHECK-LABEL: func @matmul_lowering_i8i8i32_riscv32_ukernel() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 8, 4], strides = [1, 1, 1, 1] @@ -1621,6 +1794,13 @@ func.func @matmul_lowering_i8i8i32_riscv32_ukernel() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1628,14 +1808,14 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx2() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx2"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1660,17 +1840,17 @@ func.func 
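// In each CHECK block, the trailing two entries of the sizes array on the
// flow.dispatch.tensor.load are the inner tile materialized for the target ISA
// (outer-dim tile x K0), and the #[[$MAP...]] ceildiv maps compute the matching
// outer dimensions: +i8mm above selects 8x8 i8 tiles, so M and K both round up
// by 8. An illustrative computation of one outer dimension (not taken from the
// file):
//
//   %tiled_m = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%M]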
@matmul_lowering_i8i8i32_x86_64_avx2() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> // CHECK-LABEL: func @matmul_lowering_i8i8i32_x86_64_avx2() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 8, 2], strides = [1, 1, 1, 1] @@ -1686,6 +1866,13 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx2() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1693,14 +1880,14 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512bw() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512bw"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = 
[1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1725,17 +1912,17 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512bw() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> // CHECK-LABEL: func @matmul_lowering_i8i8i32_x86_64_avx512bw() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 16, 2], strides = [1, 1, 1, 1] @@ -1751,6 +1938,13 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512bw() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1758,14 +1952,14 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512vnni() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : 
!flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1790,17 +1984,17 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512vnni() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> // CHECK-LABEL: func @matmul_lowering_i8i8i32_x86_64_avx512vnni() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 16, 2], strides = [1, 1, 1, 1] @@ -1871,6 +2065,13 @@ func.func @extend_batch_vecmat_explicit_unit_dim(%arg0: tensor<32x1x128xi8>, %ar // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1878,14 +2079,14 @@ func.func @matmul_lowering_i16i16i32_x86_64_avx2() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx2"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 
= hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1910,17 +2111,17 @@ func.func @matmul_lowering_i16i16i32_x86_64_avx2() attributes { // CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)> // CHECK-LABEL: func @matmul_lowering_i16i16i32_x86_64_avx2() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[$MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[$MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[$MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 8, 2], strides = [1, 1, 1, 1] @@ -1936,6 +2137,13 @@ func.func @matmul_lowering_i16i16i32_x86_64_avx2() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1943,14 +2151,14 @@ func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %lhs_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %rhs_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : 
!flow.dispatch.tensor>>>{%K, %N} - %out_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %out_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -1983,14 +2191,14 @@ func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() attributes { // CHECK-DAG: #[[$MAP_CEILDIV_32:.+]] = affine_map<()[s0] -> (s0 ceildiv 32)> // CHECK-DAG: #[[$MAP_IDENTITY_4D:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> // CHECK-LABEL: func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] : index -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] : index -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] : index +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index // CHECK-DAG: %[[K_CEILDIV_8:.+]] = affine.apply #[[$MAP_CEILDIV_8]]()[%[[K]]] // CHECK-DAG: %[[N_CEILDIV_32:.+]] = affine.apply #[[$MAP_CEILDIV_32]]()[%[[N]]] -// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.*}} : !flow.dispatch.tensor>{%[[M]], %[[K_CEILDIV_8]]} -// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.*}} : !flow.dispatch.tensor>{%[[N_CEILDIV_32]], %[[K_CEILDIV_8]]} -// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) {{.*}} : !flow.dispatch.tensor>{%[[M]], %[[N_CEILDIV_32]]} +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.*}} : !flow.dispatch.tensor>{%[[M]], %[[K_CEILDIV_8]]} +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.*}} : !flow.dispatch.tensor>{%[[N_CEILDIV_32]], %[[K_CEILDIV_8]]} +// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) {{.*}} : !flow.dispatch.tensor>{%[[M]], %[[N_CEILDIV_32]]} // CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[M]], %[[K_CEILDIV_8]], 1, 8], {{.*}} -> tensor // CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[N_CEILDIV_32]], %[[K_CEILDIV_8]], 32, 8], {{.*}} -> tensor // CHECK-DAG: %[[OUT:.+]] = flow.dispatch.tensor.load %[[OUT_BINDING]], offsets = [0, 0, 0, 0], sizes = [%[[M]], %[[N_CEILDIV_32]], 1, 32], {{.*}} -> tensor diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir index 52d78ca65a2ac..1e8dcba241e39 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir @@ -1,5 +1,12 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-cpu-materialize-device-encoding),canonicalize,cse)" --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) 
-> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -7,14 +14,14 @@ func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -39,23 +46,23 @@ func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { // CHECK-DAG: #[[MAP_CEILDIV:.+]] = affine_map<()[s0, s1] -> (s0 ceildiv s1)> // CHECK: func @matmul_lowering_i8i8i32_vmvx_ukernel() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK: %[[LHS_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor>> -> index, index // CHECK-DAG: %[[LHS_OUTER_SIZE0:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[M]], %[[LHS_TILE_SIZES]]#0] // CHECK-DAG: %[[LHS_OUTER_SIZE1:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[K]], %[[LHS_TILE_SIZES]]#1] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[LHS_OUTER_SIZE0]], %[[LHS_OUTER_SIZE1]], %[[LHS_TILE_SIZES]]#0, %[[LHS_TILE_SIZES]]#1} // CHECK: %[[RHS_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor>> -> index, index // CHECK-DAG: %[[RHS_OUTER_SIZE0:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[N]], %[[RHS_TILE_SIZES]]#0] // CHECK-DAG: %[[RHS_OUTER_SIZE1:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[K]], %[[RHS_TILE_SIZES]]#1] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[RHS_OUTER_SIZE0]], %[[RHS_OUTER_SIZE1]], %[[RHS_TILE_SIZES]]#0, %[[RHS_TILE_SIZES]]#1} // CHECK: %[[RESULT_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor>> -> index, index // CHECK-DAG: 
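// Unlike the fixed-ISA cases above, the VMVX ukernel path defers tile sizes to
// runtime: iree_codegen.query_tile_sizes yields them as SSA values, so the
// outer dimensions use the two-symbol ceildiv map instead of a constant
// divisor. A minimal sketch of the pattern these CHECK lines match, with the
// encoding attribute abbreviated to #enc:
//
//   %tiles:2 = iree_codegen.query_tile_sizes tensor<?x?xi8, #enc> -> index, index
//   %outer0 = affine.apply affine_map<()[s0, s1] -> (s0 ceildiv s1)>()[%M, %tiles#0]
//   %outer1 = affine.apply affine_map<()[s0, s1] -> (s0 ceildiv s1)>()[%K, %tiles#1]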
%[[RESULT_OUTER_SIZE0:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[M]], %[[RESULT_TILE_SIZES]]#0] // CHECK-DAG: %[[RESULT_OUTER_SIZE1:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[N]], %[[RESULT_TILE_SIZES]]#1] -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[RESULT_OUTER_SIZE0]], %[[RESULT_OUTER_SIZE1]], %[[RESULT_TILE_SIZES]]#0, %[[RESULT_TILE_SIZES]]#1} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[LHS_OUTER_SIZE0]], %[[LHS_OUTER_SIZE1]], %[[LHS_TILE_SIZES]]#0, %[[LHS_TILE_SIZES]]#1], strides = [1, 1, 1, 1] @@ -71,6 +78,13 @@ func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<()[s0] -> ((3 ceildiv s0) * s0)> #map1 = affine_map<()[s0] -> ((1 ceildiv s0) * s0)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> @@ -82,9 +96,9 @@ func.func @fill_matmul(%arg0: index, %arg1: index, %arg2: index, %arg3: index, % %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>>{%arg4, %arg5} + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>>{%arg4, %arg5} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 2], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> -> tensor<1x2xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> -> tensor<2x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> %7 = tensor.empty() : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> @@ -95,11 +109,11 @@ func.func @fill_matmul(%arg0: index, %arg1: index, %arg2: index, %arg3: index, % } // CHECK: func.func @fill_matmul // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK: 
%[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor> -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor> -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor> // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [1, 1, 8, 4], strides = [1, 1, 1, 1] @@ -117,6 +131,12 @@ func.func @fill_matmul(%arg0: index, %arg1: index, %arg2: index, %arg3: index, % // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -124,11 +144,11 @@ func.func @set_encoding_dynamic() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb"> } { %c0 = arith.constant 0 : index - %d0 = hal.interface.constant.load [0] : index - %d1 = hal.interface.constant.load [1] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %d0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %d1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%d0, %d1} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%d0, %d1} %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] : !flow.dispatch.tensor>{%d0, %d1} -> tensor @@ -143,12 +163,12 @@ func.func @set_encoding_dynamic() attributes { // CHECK: func @set_encoding_dynamic() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.+]] = arith.constant 0.0 -// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] -// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-DAG: %[[TILED_D0:.+]] = affine.apply #[[MAP0]]()[%[[D0]]] // CHECK-DAG: %[[TILED_D1:.+]] = affine.apply #[[MAP1]]()[%[[D1]]] -// CHECK-DAG: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_D0]], %[[TILED_D1]]} // CHECK: %[[INPUT:.+]] = flow.dispatch.tensor.load %[[INPUT_BINDING]] // CHECK: %[[EMPTY:.+]] = tensor.empty @@ -160,6 +180,12 @@ func.func @set_encoding_dynamic() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = 
affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -168,11 +194,11 @@ func.func @unset_encoding_dynamic() attributes { } { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %d0 = hal.interface.constant.load [0] : index - %d1 = hal.interface.constant.load [1] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %d0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %d1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%d0, %d1} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%d0, %d1} %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] : !flow.dispatch.tensor>>>{%d0, %d1} @@ -188,13 +214,13 @@ func.func @unset_encoding_dynamic() attributes { // CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> // CHECK: func @unset_encoding_dynamic() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] +// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) // CHECK-DAG: %[[TILED_D0:.+]] = affine.apply #[[MAP0]]()[%[[D0]]] // CHECK-DAG: %[[TILED_D1:.+]] = affine.apply #[[MAP1]]()[%[[D1]]] -// CHECK-DAG: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK-DAG: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_D0]], %[[TILED_D1]]} -// CHECK-DAG: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK: %[[INPUT:.+]] = flow.dispatch.tensor.load %[[INPUT_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_D0]], %[[TILED_D1]], 8, 4], strides = [1, 1, 1, 1] // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[D0]], %[[D1]]) @@ -204,6 +230,13 @@ func.func @unset_encoding_dynamic() attributes { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -211,14 +244,14 @@ func.func @matmul_lowering_f32f32f32_generic() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb"> } { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %0 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %K} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%K, %N} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] : !flow.dispatch.tensor>>>{%M, %K} @@ -243,17 +276,17 @@ func.func @matmul_lowering_f32f32f32_generic() attributes { // CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> // CHECK: func @matmul_lowering_f32f32f32_generic() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) // CHECK-DAG: %[[TILED_M:.+]] = affine.apply #[[MAP0]]()[%[[M]]] // CHECK-DAG: %[[TILED_K:.+]] = affine.apply #[[MAP1]]()[%[[K]]] -// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_K]]} // CHECK: %[[TILED_N:.+]] = affine.apply #[[MAP0]]()[%[[N]]] -// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_N]], %[[TILED_K]]} -// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_M]], %[[TILED_N]]} // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_M]], %[[TILED_K]], 8, 4], strides = [1, 1, 1, 1] diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir index 47a7fce5c0b21..1a9b13195ca63 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir @@ -2,14 +2,21 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=thread}, canonicalize, cse))" %s | FileCheck %s --check-prefix=THREAD // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=subgroup}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SUBGROUP +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_gpu.lowering_config<{thread = [2, 16], subgroup = [2, 16]}> #map = affine_map<(d0, d1) -> (d0, d1)> module { 
   func.func @add_tensor() {
     %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
     %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [64, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x256xf32>
     %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [64, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x256xf32>
     %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [64, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x256xf32>
@@ -44,14 +51,21 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_gpu.lowering_config<{thread = [0, 16]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 module {
   func.func @sequential_forall_mappings() {
     %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
     %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x256xf32>
     %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x256xf32>
     %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x256xf32>
@@ -80,38 +94,43 @@ module {
 // -----
-module {
-  func.func @matmul_transpose_b() attributes {translation_info = #iree_codegen.translation_info} {
-    %c4 = arith.constant 4 : index
-    %c1280 = arith.constant 1280 : index
-    %cst = arith.constant 0.000000e+00 : f16
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
-    %5 = flow.dispatch.tensor.load %2, offsets = [%3, %4], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
-    %6 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [64, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x1280xf16>
-    %7 = flow.dispatch.tensor.load %1, offsets = [%4, 0], sizes = [64, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x1280xf16>
-    %8 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x64xf32>) -> tensor<64x64xf32>
-    %9 = tensor.empty() : tensor<64x1280xf16>
-    %10 = tensor.empty() : tensor<64x1280xf16>
-    %11 = scf.for %arg0 = %c0 to %c1280 step %c4 iter_args(%arg1 = %8) -> (tensor<64x64xf32>) {
-      %extracted_slice = tensor.extract_slice %6[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
-      %extracted_slice_0 = tensor.extract_slice %9[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
-      %12 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice : tensor<64x4xf16>) outs(%extracted_slice_0 : tensor<64x4xf16>) -> tensor<64x4xf16>
-      %extracted_slice_1 = tensor.extract_slice %7[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
-      %extracted_slice_2 = tensor.extract_slice %10[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
-      %13 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice_1 : tensor<64x4xf16>) outs(%extracted_slice_2 : tensor<64x4xf16>) -> tensor<64x4xf16>
-      %14 = linalg.matmul_transpose_b {lowering_config = #iree_gpu.lowering_config<{thread = [4, 4]}>} ins(%12, %13 : tensor<64x4xf16>, tensor<64x4xf16>) outs(%arg1 : tensor<64x64xf32>) -> tensor<64x64xf32>
-      scf.yield %14 : tensor<64x64xf32>
-    }
-    flow.dispatch.tensor.store %11, %2, offsets = [%3, %4], sizes = [64, 64], strides = [1, 1] : tensor<64x64xf32> -> !flow.dispatch.tensor>
-    return
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_transpose_b() attributes {translation_info = #iree_codegen.translation_info} {
+  %c4 = arith.constant 4 : index
+  %c1280 = arith.constant 1280 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
+  %5 = flow.dispatch.tensor.load %2, offsets = [%3, %4], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
+  %6 = flow.dispatch.tensor.load %0, offsets = [%3, 0], sizes = [64, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x1280xf16>
+  %7 = flow.dispatch.tensor.load %1, offsets = [%4, 0], sizes = [64, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x1280xf16>
+  %8 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x64xf32>) -> tensor<64x64xf32>
+  %9 = tensor.empty() : tensor<64x1280xf16>
+  %10 = tensor.empty() : tensor<64x1280xf16>
+  %11 = scf.for %arg0 = %c0 to %c1280 step %c4 iter_args(%arg1 = %8) -> (tensor<64x64xf32>) {
+    %extracted_slice = tensor.extract_slice %6[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
+    %extracted_slice_0 = tensor.extract_slice %9[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
+    %12 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice : tensor<64x4xf16>) outs(%extracted_slice_0 : tensor<64x4xf16>) -> tensor<64x4xf16>
+    %extracted_slice_1 = tensor.extract_slice %7[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
+    %extracted_slice_2 = tensor.extract_slice %10[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
+    %13 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice_1 : tensor<64x4xf16>) outs(%extracted_slice_2 : tensor<64x4xf16>) -> tensor<64x4xf16>
+    %14 = linalg.matmul_transpose_b {lowering_config = #iree_gpu.lowering_config<{thread = [4, 4]}>} ins(%12, %13 : tensor<64x4xf16>, tensor<64x4xf16>) outs(%arg1 : tensor<64x64xf32>) -> tensor<64x64xf32>
+    scf.yield %14 : tensor<64x64xf32>
   }
+  flow.dispatch.tensor.store %11, %2, offsets = [%3, %4], sizes = [64, 64], strides = [1, 1] : tensor<64x64xf32> -> !flow.dispatch.tensor>
+  return
 }
 // CHECK-LABEL: func.func @matmul_transpose_b
@@ -129,30 +148,34 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #config = #iree_gpu.lowering_config<{reduction = [0, 8]}>
 #map = affine_map<()[s0] -> (s0 * 64)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 #map2 = affine_map<(d0, d1) -> (d0)>
-module {
-  func.func @reduction() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x384xf32>
-    %empty = tensor.empty() : tensor<128xf32>
-    %4 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
-    %5 = linalg.generic {
-      indexing_maps = [#map1, #map2],
-      iterator_types = ["parallel", "reduction"]
-    } ins(%3 : tensor<128x384xf32>) outs(%4 : tensor<128xf32>) attrs = {lowering_config = #config} {
-    ^bb0(%in: f32, %out: f32):
-      %7 = arith.addf %in, %out : f32
-      linalg.yield %7 : f32
-    } -> tensor<128xf32>
-    flow.dispatch.tensor.store %5, %1, offsets = [%c0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor>
-    return
-  }
+func.func @reduction() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x384xf32>
+  %empty = tensor.empty() : tensor<128xf32>
+  %4 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
+  %5 = linalg.generic {
+    indexing_maps = [#map1, #map2],
+    iterator_types = ["parallel", "reduction"]
+  } ins(%3 : tensor<128x384xf32>) outs(%4 : tensor<128xf32>) attrs = {lowering_config = #config} {
+  ^bb0(%in: f32, %out: f32):
+    %7 = arith.addf %in, %out : f32
+    linalg.yield %7 : f32
+  } -> tensor<128xf32>
+  flow.dispatch.tensor.store %5, %1, offsets = [%c0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor>
+  return
 }
 // CHECK-LABEL: func.func @reduction
@@ -167,31 +190,36 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_gpu.lowering_config<{reduction = [0, 0, 8]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @matmul_fuse() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 1.0 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
-    %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
-    %empty = tensor.empty() : tensor<64x64xf32>
-    %6 = linalg.generic {
-      indexing_maps = [#map, #map],
-      iterator_types = ["parallel", "parallel"]
-    } ins(%3 : tensor<64x64xf32>) outs(%empty : tensor<64x64xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %8 = arith.addf %in, %cst : f32
-      linalg.yield %8 : f32
-    } -> tensor<64x64xf32>
-    %7 = linalg.matmul {lowering_config = #config} ins(%6, %4 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%5 : tensor<64x64xf32>) -> tensor<64x64xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : tensor<64x64xf32> -> !flow.dispatch.tensor>
-    return
-  }
+func.func @matmul_fuse() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 1.0 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
+  %empty = tensor.empty() : tensor<64x64xf32>
+  %6 = linalg.generic {
+    indexing_maps = [#map, #map],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%3 : tensor<64x64xf32>) outs(%empty : tensor<64x64xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = arith.addf %in, %cst : f32
+    linalg.yield %8 : f32
+  } -> tensor<64x64xf32>
+  %7 = linalg.matmul {lowering_config = #config} ins(%6, %4 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%5 : tensor<64x64xf32>) -> tensor<64x64xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [%c0, %c0], sizes = [64, 64], strides = [1, 1] : tensor<64x64xf32> -> !flow.dispatch.tensor>
+  return
 }
 // CHECK-LABEL: func.func @matmul_fuse
@@ -201,32 +229,37 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_gpu.lowering_config<{thread = [8, 8]}>
-module {
-  func.func @matmul_cleanup() {
-    %c8 = arith.constant 8 : index
-    %c64 = arith.constant 64 : index
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
-    %6 = scf.for %arg0 = %c0 to %c64 step %c8 iter_args(%arg1 = %5) -> (tensor<64x64xf32>) {
-      %extracted_slice = tensor.extract_slice %3[0, %arg0] [64, 8] [1, 1] : tensor<64x64xf32> to tensor<64x8xf32>
-      %extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [8, 64] [1, 1] : tensor<64x64xf32> to tensor<8x64xf32>
-      %7 = linalg.matmul {lowering_config = #config} ins(%extracted_slice, %extracted_slice_0 : tensor<64x8xf32>, tensor<8x64xf32>) outs(%arg1 : tensor<64x64xf32>) -> tensor<64x64xf32>
-      scf.yield %7 : tensor<64x64xf32>
-    }
-    flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : tensor<64x64xf32> -> !flow.dispatch.tensor>
-    return
+func.func @matmul_cleanup() {
+  %c8 = arith.constant 8 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x64xf32>
+  %6 = scf.for %arg0 = %c0 to %c64 step %c8 iter_args(%arg1 = %5) -> (tensor<64x64xf32>) {
+    %extracted_slice = tensor.extract_slice %3[0, %arg0] [64, 8] [1, 1] : tensor<64x64xf32> to tensor<64x8xf32>
+    %extracted_slice_0 = tensor.extract_slice %4[%arg0, 0] [8, 64] [1, 1] : tensor<64x64xf32> to tensor<8x64xf32>
+    %7 = linalg.matmul {lowering_config = #config} ins(%extracted_slice, %extracted_slice_0 : tensor<64x8xf32>, tensor<8x64xf32>) outs(%arg1 : tensor<64x64xf32>) -> tensor<64x64xf32>
+    scf.yield %7 : tensor<64x64xf32>
   }
+  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [64, 64], strides = [1, 1] : tensor<64x64xf32> -> !flow.dispatch.tensor>
+  return
 }
 // THREAD-LABEL: func.func @matmul_cleanup
-// THREAD: %[[B0:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// THREAD: %[[B1:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// THREAD: %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// THREAD: %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // THREAD: %[[A:.+]] = flow.dispatch.tensor.load %[[B0]]
 // THREAD: %[[B:.+]] = flow.dispatch.tensor.load %[[B1]]
 // THREAD: scf.for %{{.*}} = %c0 to %c64 step %c8
@@ -237,15 +270,22 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_gpu.derived_thread_config
 #map = affine_map<(d0, d1) -> (d0, d1)>
 module {
   func.func @inferred_add_tensor() attributes {translation_info = #iree_codegen.translation_info} {
     %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
     %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [64, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x256xf32>
     %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [64, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x256xf32>
     %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [64, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x256xf32>
@@ -274,13 +314,20 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_gpu.derived_thread_config
 module {
   func.func @inferred_im2col() attributes {translation_info = #iree_codegen.translation_info} {
     %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+    %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
     %2 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0, %c0, %c0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x34x34x128xf16>
     %3 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0, %c0], sizes = [2, 128, 8], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x128x8xf16>
     %4 = iree_linalg_ext.im2col {lowering_config = #config} strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3] m_offset = [0] k_offset = [0] batch_pos = [0] m_pos = [2, 3] k_pos = [1] ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<2x128x8xf16>) -> tensor<2x128x8xf16>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_create_fast_slow_path.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_create_fast_slow_path.mlir
index 44d92b14fef7b..126c6f582638a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_create_fast_slow_path.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_create_fast_slow_path.mlir
@@ -1,14 +1,22 @@
 // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-create-fast-slow-path))" --mlir-print-local-scope %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 func.func @padded_conv() {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
   %c32 = arith.constant 32 : index
   %c112 = arith.constant 112 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_count_x = hal.interface.workgroup.count[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir
index 84f3cbf1b10f5..0831e5b75c7e0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir
@@ -1,36 +1,41 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-distribute, cse))" %s --split-input-file | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<()[s0] -> (s0 * 256)>
 #map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
 #map2 = affine_map<(d0) -> (d0 * 4)>
 #translation = #iree_codegen.translation_info
-module {
-  func.func @add_tensor() attributes {translation_info = #translation} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c64 = arith.constant 64 : index
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
-    memref.assume_alignment %0, 64 : memref<233x1024xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
-    memref.assume_alignment %1, 64 : memref<233x1024xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
-    memref.assume_alignment %2, 64 : memref<233x1024xf32>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %3 = affine.apply #map()[%workgroup_id_x]
-    %subview = memref.subview %2[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
-    %subview_0 = memref.subview %0[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
-    %subview_1 = memref.subview %1[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
-    scf.forall (%arg0) in (%c64) {
-      %4 = affine.apply #map2(%arg0)
-      %subview_2 = memref.subview %subview[0, %4] [1, 4] [1, 1] : memref<1x256xf32, #map1> to memref<1x4xf32, #map1>
-      %5 = vector.transfer_read %subview_0[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
-      %6 = vector.transfer_read %subview_1[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
-      %7 = arith.addf %5, %6 : vector<4xf32>
-      vector.transfer_write %7, %subview_2[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #map1>
-    } {mapping = [#gpu.thread]}
-    return
-  }
+func.func @add_tensor() attributes {translation_info = #translation} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<233x1024xf32>
+  memref.assume_alignment %0, 64 : memref<233x1024xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<233x1024xf32>
+  memref.assume_alignment %1, 64 : memref<233x1024xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<233x1024xf32>
+  memref.assume_alignment %2, 64 : memref<233x1024xf32>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %3 = affine.apply #map()[%workgroup_id_x]
+  %subview = memref.subview %2[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+  %subview_0 = memref.subview %0[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+  %subview_1 = memref.subview %1[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+  scf.forall (%arg0) in (%c64) {
+    %4 = affine.apply #map2(%arg0)
+    %subview_2 = memref.subview %subview[0, %4] [1, 4] [1, 1] : memref<1x256xf32, #map1> to memref<1x4xf32, #map1>
+    %5 = vector.transfer_read %subview_0[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+    %6 = vector.transfer_read %subview_1[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+    %7 = arith.addf %5, %6 : vector<4xf32>
+    vector.transfer_write %7, %subview_2[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #map1>
+  } {mapping = [#gpu.thread]}
+  return
 }
 // CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 * 4)>
@@ -46,37 +51,42 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<()[s0] -> (s0 * 256)>
 #map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
 #map2 = affine_map<(d0) -> (d0 * 4)>
 #translation = #iree_codegen.translation_info
-module {
-  func.func @add_tensor_lane_id() attributes {translation_info = #translation} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c64 = arith.constant 64 : index
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
-    memref.assume_alignment %0, 64 : memref<233x1024xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
-    memref.assume_alignment %1, 64 : memref<233x1024xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<233x1024xf32>
-    memref.assume_alignment %2, 64 : memref<233x1024xf32>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %3 = affine.apply #map()[%workgroup_id_x]
-    %subview = memref.subview %2[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
-    %subview_0 = memref.subview %0[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
-    %subview_1 = memref.subview %1[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
-    scf.forall (%arg0) in (%c64) {
-      %4 = affine.apply #map2(%arg0)
-      %subview_2 = memref.subview %subview[0, %4] [1, 4] [1, 1] : memref<1x256xf32, #map1> to memref<1x4xf32, #map1>
-      %5 = vector.transfer_read %subview_0[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
-      %6 = vector.transfer_read %subview_1[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
-      %7 = arith.addf %5, %6 : vector<4xf32>
-      vector.transfer_write %7, %subview_2[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #map1>
-    } {mapping = [#iree_gpu.lane_id<0>]}
-    return
-  }
+func.func @add_tensor_lane_id() attributes {translation_info = #translation} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<233x1024xf32>
+  memref.assume_alignment %0, 64 : memref<233x1024xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<233x1024xf32>
+  memref.assume_alignment %1, 64 : memref<233x1024xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<233x1024xf32>
+  memref.assume_alignment %2, 64 : memref<233x1024xf32>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %3 = affine.apply #map()[%workgroup_id_x]
+  %subview = memref.subview %2[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+  %subview_0 = memref.subview %0[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+  %subview_1 = memref.subview %1[%workgroup_id_y, %3] [1, 256] [1, 1] : memref<233x1024xf32> to memref<1x256xf32, #map1>
+  scf.forall (%arg0) in (%c64) {
+    %4 = affine.apply #map2(%arg0)
+    %subview_2 = memref.subview %subview[0, %4] [1, 4] [1, 1] : memref<1x256xf32, #map1> to memref<1x4xf32, #map1>
+    %5 = vector.transfer_read %subview_0[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+    %6 = vector.transfer_read %subview_1[%c0, %4], %cst {in_bounds = [true]} : memref<1x256xf32, #map1>, vector<4xf32>
+    %7 = arith.addf %5, %6 : vector<4xf32>
+    vector.transfer_write %7, %subview_2[%c0, %c0] {in_bounds = [true]} : vector<4xf32>, memref<1x4xf32, #map1>
+  } {mapping = [#iree_gpu.lane_id<0>]}
+  return
 }
 // CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 * 4)>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir
index eef9c182fdefc..3535a86173578 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir
@@ -2,7 +2,13 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-pipelining{epilogue-peeling=false}))" --split-input-file %s | FileCheck %s
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-pipelining{pipeline-depth=3 schedule-index=2 epilogue-peeling=false}))" --split-input-file %s | FileCheck -check-prefix=CHECK-NV %s
-
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @_matmul_f16_f16_dispatch_0_fill_3456x1024() {
   %c2048 = arith.constant 2048 : index
   %c32 = arith.constant 32 : index
@@ -15,11 +21,11 @@ func.func @_matmul_f16_f16_dispatch_0_fill_3456x1024() {
   %3 = gpu.thread_id z
   %4 = memref.alloc() : memref<4x32x40xf16, 3>
   %5 = memref.alloc() : memref<4x32x40xf16, 3>
-  %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<3456x2048xf16>
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<3456x2048xf16>
   memref.assume_alignment %6, 64 : memref<3456x2048xf16>
-  %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x1024xf16>
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2048x1024xf16>
   memref.assume_alignment %7, 64 : memref<2048x1024xf16>
-  %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<3456x1024xf16>
+  %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<3456x1024xf16>
   memref.assume_alignment %8, 64 : memref<3456x1024xf16>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -57,6 +63,13 @@ func.func @_matmul_f16_f16_dispatch_0_fill_3456x1024() {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @nvidia_tenscore_schedule_f16() {
   %c3 = arith.constant 3 : index
   %c31 = arith.constant 31 : index
@@ -73,11 +86,11 @@ func.func @nvidia_tenscore_schedule_f16() {
   %alloc = memref.alloc() : memref<128x256xf16, #gpu.address_space>
   %alloc_1 = memref.alloc() : memref<3x128x32xf16, #gpu.address_space>
   %alloc_2 = memref.alloc() : memref<3x32x256xf16, #gpu.address_space>
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x1280xf16>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<512x1280xf16>
   memref.assume_alignment %3, 64 : memref<512x1280xf16>
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1280x1280xf16>
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<1280x1280xf16>
   memref.assume_alignment %4, 64 : memref<1280x1280xf16>
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<512x1280xf16>
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<512x1280xf16>
   memref.assume_alignment %5, 64 : memref<512x1280xf16>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -503,6 +516,14 @@ func.func @nvidia_tenscore_schedule_f16() {
 // CHECK-NV: vector.store
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @nvidia_tenscore_schedule_f32() {
   %c31 = arith.constant 31 : index
   %c2 = arith.constant 2 : index
@@ -519,11 +540,11 @@ func.func @nvidia_tenscore_schedule_f32() {
   %alloc = memref.alloc() : memref<128x128xf32, #gpu.address_space>
   %alloc_2 = memref.alloc() : memref<3x128x32xf32, #gpu.address_space>
   %alloc_3 = memref.alloc() : memref<3x32x128xf32, #gpu.address_space>
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf32>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf32>
   memref.assume_alignment %3, 64 : memref<256x256xf32>
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf32>
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<256x256xf32>
   memref.assume_alignment %4, 64 : memref<256x256xf32>
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<256x256xf32>
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<256x256xf32>
   memref.assume_alignment %5, 64 : memref<256x256xf32>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -1345,5 +1366,3 @@ func.func @nvidia_tenscore_schedule_f32() {
 // CHECK-NV-COUNT-32: nvgpu.mma.sync
 // CHECK-NV: }
 // CHECK-NV: vector.store
-
-// -----
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir
index 906ed23f162b4..9377136cd8bd4 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir
@@ -4,13 +4,20 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=transpose}))" \
 // RUN:   --split-input-file %s | FileCheck --check-prefix=TRANSPOSE %s
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @matmul() {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c96 = arith.constant 96 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %3 = tensor.empty() : tensor<128x96xf32>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_count_x = hal.interface.workgroup.count[0] : index
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir
index 640158da79e26..04ffb5b6427bd 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir
@@ -33,9 +33,15 @@
 // TRANSPOSE-DAG: affine.apply #{{.+}}()[%[[REM]]]
 // TRANSPOSE: return
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @main_dispatch_0 {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info, workgroup_size = [64 : index, 16 : index, 1 : index]} {
+    hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info, workgroup_size = [64 : index, 16 : index, 1 : index]} {
     ^bb0(%arg0: !hal.device):
       %c250 = arith.constant 250 : index
       %c500 = arith.constant 500 : index
@@ -48,8 +54,8 @@ hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
       %c64 = arith.constant 64 : index
       %cst = arith.constant 0.000000e+00 : f16
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
       %workgroup_id_x = hal.interface.workgroup.id[0] : index
       %workgroup_id_y = hal.interface.workgroup.id[1] : index
       %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y]
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir
index d67493a89fb59..4a681e3610c86 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_alloc.mlir
@@ -1,12 +1,19 @@
 // RUN: iree-opt %s --allow-unregistered-dialect --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-tensor-tile-to-serial-loops,iree-codegen-gpu-tensor-alloc))" | FileCheck %s
 // RUN: iree-opt %s --allow-unregistered-dialect --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-tensor-tile-to-serial-loops{coalesce-loops},iree-codegen-gpu-tensor-alloc))" | FileCheck %s --check-prefix=COALESCE_LOOPS
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @matmul_2048x512x1024() {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
   %3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
@@ -31,12 +38,19 @@ func.func @matmul_2048x512x1024() {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @matmul_1x384x384() {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x384xf32>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
@@ -54,12 +68,19 @@ func.func @matmul_1x384x384() {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @matmul_multi_uses() {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
   %3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
@@ -86,44 +107,51 @@ func.func @matmul_multi_uses() {
 // -----
- func.func @matmul_33x33x903168_f32() {
-   %c0 = arith.constant 0 : index
-   %cst = arith.constant 0.000000e+00 : f32
-   %cst_0 = arith.constant 9.031680e+05 : f32
-   %cst_1 = arith.constant 0.949999988 : f32
-   %c32 = arith.constant 32 : index
-   %workgroup_id_x = hal.interface.workgroup.id[0] : index
-   %0 = affine.min affine_map<()[s0] -> (s0 * -32 + 33, 32)>()[%workgroup_id_x]
-   %1 = arith.cmpi eq, %0, %c32 : index
-   scf.if %1 {
-     %2 = hal.interface.constant.load[0] : i32
-     %3 = hal.interface.constant.load[1] : i32
-     %4 = hal.interface.constant.load[2] : i32
-     %5 = arith.index_castui %2 {stream.alignment = 4096 : index, stream.values = [1240289280 : index, 1789415424 : index]} : i32 to index
-     %6 = arith.index_castui %3 {stream.alignment = 8192 : index, stream.values = [633077760 : index, 752295936 : index]} : i32 to index
-     %7 = arith.index_castui %4 {stream.alignment = 64 : index, stream.values = [1486349952 : index, 1486358464 : index]} : i32 to index
-     %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor>
-     %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor>
-     %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-     %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor>
-     %12 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
-     %13 = flow.dispatch.tensor.load %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x33xf32>
-     %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [903168, 33], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<903168x33xf32>
-     %15 = flow.dispatch.tensor.load %10, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x33xf32>
-     %16 = flow.dispatch.tensor.load %8, offsets = [%12, 0], sizes = [32, 903168], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x903168xf32>
-     %17 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%13 : tensor<32x33xf32>) -> tensor<32x33xf32>
-     %18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config} ins(%16, %14 : tensor<32x903168xf32>, tensor<903168x33xf32>) outs(%17 : tensor<32x33xf32>) -> tensor<32x33xf32>
-     %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<32x33xf32>) outs(%18 : tensor<32x33xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {
-     ^bb0(%in: f32, %out: f32):
-       %20 = arith.divf %out, %cst_0 : f32
-       %21 = arith.mulf %in, %cst_1 : f32
-       %22 = arith.addf %21, %20 : f32
-       linalg.yield %22 : f32
-     } -> tensor<32x33xf32>
-     flow.dispatch.tensor.store %19, %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : tensor<32x33xf32> -> !flow.dispatch.tensor>
-   }
-   return
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_33x33x903168_f32() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant 9.031680e+05 : f32
+  %cst_1 = arith.constant 0.949999988 : f32
+  %c32 = arith.constant 32 : index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %0 = affine.min affine_map<()[s0] -> (s0 * -32 + 33, 32)>()[%workgroup_id_x]
+  %1 = arith.cmpi eq, %0, %c32 : index
+  scf.if %1 {
+    %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+    %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+    %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+    %5 = arith.index_castui %2 {stream.alignment = 4096 : index, stream.values = [1240289280 : index, 1789415424 : index]} : i32 to index
+    %6 = arith.index_castui %3 {stream.alignment = 8192 : index, stream.values = [633077760 : index, 752295936 : index]} : i32 to index
+    %7 = arith.index_castui %4 {stream.alignment = 64 : index, stream.values = [1486349952 : index, 1486358464 : index]} : i32 to index
+    %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor>
+    %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor>
+    %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+    %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%7) : !flow.dispatch.tensor>
+    %12 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
+    %13 = flow.dispatch.tensor.load %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x33xf32>
+    %14 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [903168, 33], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<903168x33xf32>
+    %15 = flow.dispatch.tensor.load %10, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x33xf32>
+    %16 = flow.dispatch.tensor.load %8, offsets = [%12, 0], sizes = [32, 903168], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x903168xf32>
+    %17 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%13 : tensor<32x33xf32>) -> tensor<32x33xf32>
+    %18 = linalg.matmul {lowering_config = #iree_codegen.lowering_config} ins(%16, %14 : tensor<32x903168xf32>, tensor<903168x33xf32>) outs(%17 : tensor<32x33xf32>) -> tensor<32x33xf32>
+    %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%15 : tensor<32x33xf32>) outs(%18 : tensor<32x33xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {
+    ^bb0(%in: f32, %out: f32):
+      %20 = arith.divf %out, %cst_0 : f32
+      %21 = arith.mulf %in, %cst_1 : f32
+      %22 = arith.addf %21, %20 : f32
+      linalg.yield %22 : f32
+    } -> tensor<32x33xf32>
+    flow.dispatch.tensor.store %19, %11, offsets = [%12, 0], sizes = [32, 33], strides = [1, 1] : tensor<32x33xf32> -> !flow.dispatch.tensor>
   }
+  return
+}
 // The allocation should not happen when there is any unaligned size, e.g., 33 in this case.
 //
@@ -132,14 +160,23 @@ func.func @matmul_multi_uses() {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
 func.func @weight_dequant_matmul() {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %3 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %4 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor>
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
@@ -198,12 +235,19 @@ func.func @weight_dequant_matmul() {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @conv() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 1, subgroup_n_count = 4>}>} {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %2
= hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_z = hal.interface.workgroup.id[2] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_id_x = hal.interface.workgroup.id[0] : index diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir index 6527e39bc5834..364c25bb347d4 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tensor_tile.mlir @@ -1,5 +1,12 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-tensor-tile, cse))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 256)> #map1 = affine_map<(d0, d1) -> (d0, d1)> @@ -7,9 +14,9 @@ module { func.func @add_tensor() attributes {translation_info = #translation} { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %3 = affine.apply #map()[%workgroup_id_x] @@ -28,9 +35,9 @@ module { // CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 * 4)> // CHECK-LABEL: func.func @add_tensor -// CHECK-DAG: %[[A:.*]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[B:.*]] = hal.interface.binding.subspan set(0) binding(1) -// CHECK-DAG: %[[C:.*]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK-DAG: %[[A:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[B:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[C:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-DAG: %[[LA:.*]] = flow.dispatch.tensor.load %[[A]] // CHECK-DAG: %[[LB:.*]] = flow.dispatch.tensor.load %[[B]] // CHECK-DAG: %[[LC:.*]] = flow.dispatch.tensor.load %[[C]] @@ -51,6 +58,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 
64)> #map1 = affine_map<(d0, d1) -> (d0, d1)> @@ -60,8 +73,8 @@ module { func.func @reduction() attributes {translation_info = #translation} { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %2 = affine.apply #map()[%workgroup_id_x] %3 = flow.dispatch.tensor.load %1, offsets = [%2], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> @@ -103,6 +116,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> @@ -112,8 +131,8 @@ module { func.func @reduction_broadcast() attributes {translation_info = #translation} { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %2 = affine.apply #map()[%workgroup_id_x] diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile.mlir index 097c1d0272870..9cc4a19ff6ed2 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile.mlir @@ -1,18 +1,24 @@ // RUN: iree-opt -split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-tile))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @innermost_reduction() { %c1 = arith.constant 1 : index %c128 = arith.constant 128 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 394752 : index, 984064 : index]} : i32 to index %4 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [0 : index, 196608 : index, 197120 : index]} : i32 to index %5 = arith.index_cast %2 {stream.alignment = 512 : index, stream.values = [512 : index, 197120 : index, 197632 : index]} : i32 to index 
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor> - %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : !flow.dispatch.tensor> - %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%3) : !flow.dispatch.tensor> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%4) : !flow.dispatch.tensor> + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %9 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] @@ -55,6 +61,12 @@ func.func @innermost_reduction() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @has_scf_if() { %c49152 = arith.constant 49152 : index %c0 = arith.constant 0 : index @@ -62,8 +74,8 @@ func.func @has_scf_if() { %c1023_i32 = arith.constant 1023 : i32 %c2_i32 = arith.constant 2 : i32 %c0_i32 = arith.constant 0 : i32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %2 = affine.apply affine_map<()[s0] -> (s0 * 256)>()[%workgroup_id_x] diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir index e49d6aab3c624..2cea3e2a678ec 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_reduction.mlir @@ -1,9 +1,15 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-tile-reduction),canonicalize,cse)" --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @warp_reduction_dispatch() { %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %2 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xf32> %3 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_x, 0], sizes = [1, 10240], strides = [1, 1] : !flow.dispatch.tensor> -> 
tensor<1x10240xf32> @@ -43,11 +49,18 @@ func.func @warp_reduction_dispatch() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @warp_reduction_batch_matmul() { %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %workgroup_id_z = hal.interface.workgroup.id[2] : index @@ -82,10 +95,16 @@ func.func @warp_reduction_batch_matmul() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @warp_reduction_broadcast_dispatch() { %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %2 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0], sizes = [1, 10240], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x10240xf32> %3 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_x, 0], sizes = [1, 10240], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x10240xf32> @@ -144,13 +163,22 @@ func.func @warp_reduction_broadcast_dispatch() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> func.func @warp_reduction_multi_reduction() { %cst = arith.constant 0.000000e+00 : f32 - %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> - %14 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %14 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(4) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %15 = flow.dispatch.tensor.load %14, offsets = [%workgroup_id_x], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xf32> %16 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x128xf32> @@ -181,7 +209,7 @@ func.func @warp_reduction_multi_reduction() { return } -// CHECk-LABEL: func.func @warp_reduction_multi_reduction() +// CHECK-LABEL: func.func @warp_reduction_multi_reduction() // CHECK: %[[FILL:.+]] = linalg.fill {{.+}} -> tensor<1x2x64xf32> diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir index 2faaef704b757..bb3565f079d33 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/transform_gpu_workgroup_swizzle.mlir @@ -1,12 +1,19 @@ // RUN: iree-opt %s --iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @matmul() { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c96 = arith.constant 96 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = tensor.empty() : tensor<128x96xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir index 4a4c310a6d253..a50b6ec01dc6c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir @@ -1,6 +1,12 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline='builtin.module(func.func(iree-codegen-vector-reduction-to-gpu, cse))' %s | FileCheck %s --check-prefix=CDNA3 +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)> #translation_info = #iree_codegen.translation_info module { @@ -11,8 +17,8 @@ module { %cst_1 = arith.constant dense<3.840000e+02> : vector<1xf32> %c32 = arith.constant 32 : index %c384 = 
arith.constant 384 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x384xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<128x384xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %thread_id_x = gpu.thread_id x %2 = affine.apply #map()[%thread_id_x, %workgroup_id_x] @@ -69,6 +75,13 @@ module { // Make sure memref.load from uniform buffers are hoisted out as uniform code. +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, uniform_buffer> + ]> +]> #translation_info = #iree_codegen.translation_info #map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)> module { @@ -80,14 +93,14 @@ module { %cst_1 = arith.constant dense<3.840000e+02> : vector<1xf32> %c32 = arith.constant 32 : index %c384 = arith.constant 384 : index - %0 = hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) offset(%c0) : memref<1xvector<4xi32>, #hal.descriptor_type<uniform_buffer>> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) offset(%c0) : memref<1xvector<4xi32>, #hal.descriptor_type<uniform_buffer>> %1 = memref.load %0[%c0] : memref<1xvector<4xi32>, #hal.descriptor_type<uniform_buffer>> %2 = vector.extractelement %1[%c0 : index] : vector<4xi32> %3 = vector.extractelement %1[%c1 : index] : vector<4xi32> %4 = arith.index_castui %2 : i32 to index %5 = arith.index_castui %3 : i32 to index - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : memref<128x384xf32> - %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : memref<128xf32> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%4) : memref<128x384xf32> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : memref<128xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %thread_id_x = gpu.thread_id x %8 = affine.apply #map()[%thread_id_x, %workgroup_id_x] @@ -106,14 +119,14 @@ module { // CHECK-LABEL: func.func @reduce_uniform_buffer_offset() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) +// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[C0]]] // CHECK: %[[EXT0:.+]] = vector.extractelement %[[LOAD]][%[[C0]] : index] : vector<4xi32> // CHECK: %[[EXT1:.+]] = vector.extractelement %[[LOAD]][%[[C1]] : index] : vector<4xi32> // CHECK: %[[OFFSET0:.+]] = arith.index_castui %[[EXT0]] : i32 to index // CHECK: %[[OFFSET1:.+]] = arith.index_castui %[[EXT1]] : i32 to index -// CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[OFFSET0]]) -// CHECK: hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[OFFSET1]]) +// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[OFFSET0]]) +// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[OFFSET1]]) // CHECK: scf.for // CHECK-COUNT-5: gpu.shuffle // CHECK: arith.addf @@ -123,7 +136,13 @@ module { // Make sure memref.load from readonly storage buffers are hoisted out as uniform code. - +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<()[s0, s1] -> (s1 * 2 + s0 floordiv 32)> #translation_info = #iree_codegen.translation_info module { @@ -135,14 +154,14 @@ module { %cst_1 = arith.constant dense<3.840000e+02> : vector<1xf32> %c32 = arith.constant 32 : index %c384 = arith.constant 384 : index - %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1xvector<4xi32>, #hal.descriptor_type<storage_buffer>> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : memref<1xvector<4xi32>, #hal.descriptor_type<storage_buffer>> %1 = memref.load %0[%c0] : memref<1xvector<4xi32>, #hal.descriptor_type<storage_buffer>> %2 = vector.extractelement %1[%c0 : index] : vector<4xi32> %3 = vector.extractelement %1[%c1 : index] : vector<4xi32> %4 = arith.index_castui %2 : i32 to index %5 = arith.index_castui %3 : i32 to index - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : memref<128x384xf32> - %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : memref<128xf32> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%4) : memref<128x384xf32> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : memref<128xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %thread_id_x = gpu.thread_id x %8 = affine.apply #map()[%thread_id_x, %workgroup_id_x] @@ -161,14 +180,14 @@ module { // CHECK-LABEL: func.func @reduce_storage_buffer_offset() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[C0]]] // CHECK: %[[EXT0:.+]] = vector.extractelement %[[LOAD]][%[[C0]] : index] : vector<4xi32> // CHECK: %[[EXT1:.+]] = vector.extractelement %[[LOAD]][%[[C1]] : index] : vector<4xi32> // CHECK: %[[OFFSET0:.+]] = arith.index_castui %[[EXT0]] : i32 to index // CHECK: %[[OFFSET1:.+]] = arith.index_castui %[[EXT1]] : i32 to index -// CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[OFFSET0]]) -// CHECK: hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[OFFSET1]]) +// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[OFFSET0]]) +// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[OFFSET1]]) // CHECK: scf.for // CHECK-COUNT-5: gpu.shuffle // CHECK: arith.addf @@ -176,6 +195,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #translation_info = #iree_codegen.translation_info module { func.func @shared_memory_copy() attributes {translation_info = #translation_info} { @@ -183,8 +208,8 @@ module { %cst = arith.constant dense<0.000000e+00> :
vector<1xf32> %cst_0 = arith.constant 0.000000e+00 : f32 %c32 = arith.constant 32 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<128x32xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128x32xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32, #gpu.address_space<workgroup>> %2 = vector.transfer_read %0[%workgroup_id_x, %c0], %cst_0 {in_bounds = [true]} : memref<128x32xf32>, vector<32xf32> @@ -209,6 +234,13 @@ module { // Check that the multi-row matvec gets distributed across subgroup threads. +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #translation_info = #iree_codegen.translation_info #map = affine_map<()[s0] -> (s0 * 4)> #map1 = affine_map<(d0, d1) -> (0, d1)> @@ -221,11 +253,11 @@ module { %c512 = arith.constant 512 : index %cst_1 = arith.constant 0.000000e+00 : f16 %thread_id_x = gpu.thread_id x - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> memref.assume_alignment %0, 64 : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> memref.assume_alignment %1, 64 : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> memref.assume_alignment %2, 64 : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>> %workgroup_id_x = hal.interface.workgroup.id[0] : index %3 = affine.apply #map()[%workgroup_id_x] @@ -258,13 +290,20 @@ module { // CDNA3-NEXT: return // ----- + +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #translation_info = #iree_codegen.translation_info module { func.func @simple_nd_write() attributes {translation_info = #translation_info} { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4x1024xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<4x1024xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<4x1024xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<4x1024xf32> %2 =
vector.transfer_read %0[%c0, %c0], %cst {in_bounds = [true, true]} : memref<4x1024xf32>, vector<4x1024xf32> vector.transfer_write %2, %1[%c0, %c0] {in_bounds = [true, true]} : vector<4x1024xf32>, memref<4x1024xf32> return diff --git a/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir b/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir index 1d9a1cb6ae565..cbbcde4115e4b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/bufferize_copy_only_dispatches.mlir @@ -1,45 +1,50 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-bufferize-copy-only-dispatches))" --split-input-file %s | FileCheck %s -builtin.module { - func.func @tensor_insert_slice() { - %slice_size = hal.interface.constant.load[0] : index - %dest_offset_y = hal.interface.constant.load[1] : index - %dest_offset_x = hal.interface.constant.load[2] : index - %dest_stride_y = hal.interface.constant.load[3] : index - %dest_stride_x = hal.interface.constant.load[4] : index - %source_offset_y = hal.interface.constant.load[5] : index - %source_offset_x = hal.interface.constant.load[6] : index - %source_stride_y = hal.interface.constant.load[7] : index - %source_stride_x = hal.interface.constant.load[8] : index - %dest_binding_size_y = hal.interface.constant.load[9] : index - %dest_binding_size_x = hal.interface.constant.load[10] : index - %source_binding_size_y = hal.interface.constant.load[11] : index - %source_binding_size_x = hal.interface.constant.load[12] : index - %source = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) - : !flow.dispatch.tensor>{%source_binding_size_y, %source_binding_size_x} - %dest = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) - : !flow.dispatch.tensor>{%dest_binding_size_y, %dest_binding_size_x} - %source_load = flow.dispatch.tensor.load %source, offsets = [%source_offset_y, %source_offset_x], - sizes = [1, %slice_size], strides = [%source_stride_y, %source_stride_x] - : !flow.dispatch.tensor>{%source_binding_size_y, %source_binding_size_x} -> tensor - flow.dispatch.tensor.store %source_load, %dest, offsets = [%dest_offset_y, %dest_offset_x], - sizes = [%slice_size, 1], strides = [%dest_stride_y, %dest_stride_x] - : tensor -> !flow.dispatch.tensor>{%dest_binding_size_y, %dest_binding_size_x} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @tensor_insert_slice() { + %slice_size = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %dest_offset_y = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %dest_offset_x = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %dest_stride_y = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %dest_stride_x = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %source_offset_y = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %source_offset_x = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index + %source_stride_y = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : index + %source_stride_x = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : index + %dest_binding_size_y = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : index + %dest_binding_size_x = 
hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : index + %source_binding_size_y = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : index + %source_binding_size_x = hal.interface.constant.load layout(#pipeline_layout) ordinal(12) : index + %source = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) + : !flow.dispatch.tensor>{%source_binding_size_y, %source_binding_size_x} + %dest = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) + : !flow.dispatch.tensor>{%dest_binding_size_y, %dest_binding_size_x} + %source_load = flow.dispatch.tensor.load %source, offsets = [%source_offset_y, %source_offset_x], + sizes = [1, %slice_size], strides = [%source_stride_y, %source_stride_x] + : !flow.dispatch.tensor>{%source_binding_size_y, %source_binding_size_x} -> tensor + flow.dispatch.tensor.store %source_load, %dest, offsets = [%dest_offset_y, %dest_offset_x], + sizes = [%slice_size, 1], strides = [%dest_stride_y, %dest_stride_x] + : tensor -> !flow.dispatch.tensor>{%dest_binding_size_y, %dest_binding_size_x} + return } + // CHECK: func.func @tensor_insert_slice() -// CHECK-DAG: %[[SLICE_SIZE:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[DEST_OFFSET_Y:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[DEST_OFFSET_X:.+]] = hal.interface.constant.load[2] -// CHECK-DAG: %[[DEST_STRIDE_Y:.+]] = hal.interface.constant.load[3] -// CHECK-DAG: %[[DEST_STRIDE_X:.+]] = hal.interface.constant.load[4] -// CHECK-DAG: %[[SOURCE_OFFSET_Y:.+]] = hal.interface.constant.load[5] -// CHECK-DAG: %[[SOURCE_OFFSET_X:.+]] = hal.interface.constant.load[6] -// CHECK-DAG: %[[SOURCE_STRIDE_Y:.+]] = hal.interface.constant.load[7] -// CHECK-DAG: %[[SOURCE_STRIDE_X:.+]] = hal.interface.constant.load[8] -// CHECK-DAG: %[[SOURCE:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[SLICE_SIZE:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[DEST_OFFSET_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[DEST_OFFSET_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) +// CHECK-DAG: %[[DEST_STRIDE_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3) +// CHECK-DAG: %[[DEST_STRIDE_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(4) +// CHECK-DAG: %[[SOURCE_OFFSET_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(5) +// CHECK-DAG: %[[SOURCE_OFFSET_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(6) +// CHECK-DAG: %[[SOURCE_STRIDE_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(7) +// CHECK-DAG: %[[SOURCE_STRIDE_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(8) +// CHECK-DAG: %[[SOURCE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][%[[SOURCE_OFFSET_Y]], %[[SOURCE_OFFSET_X]]] [1, %[[SLICE_SIZE]]] [%[[SOURCE_STRIDE_Y]], %[[SOURCE_STRIDE_X]]] // CHECK-DAG: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][%[[DEST_OFFSET_Y]], %[[DEST_OFFSET_X]]] [%[[SLICE_SIZE]], 1] [%[[DEST_STRIDE_Y]], %[[DEST_STRIDE_X]]] // CHECK: linalg.generic @@ -48,19 +53,24 @@ builtin.module { // ----- -builtin.module { - func.func @UpSampling1D() { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) 
offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(uniform_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2, 1, 3], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x3xf32> - flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [2, 1, 3], strides = [1, 1, 1] : tensor<2x3xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, uniform_buffer> + ]> +]> +func.func @UpSampling1D() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2, 1, 3], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x3xf32> + flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [2, 1, 3], strides = [1, 1, 1] : tensor<2x3xf32> -> !flow.dispatch.tensor> + return } + // CHECK-LABEL: func.func @UpSampling1D() -// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[SOURCE:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[SOURCE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[SOURCE_SUBVIEW:.+]] = memref.subview %[[SOURCE]][0, 0, 0] [2, 1, 3] // CHECK-DAG: %[[DEST_SUBVIEW:.+]] = memref.subview %[[DEST]][0, 0, 0] [2, 1, 3] // CHECK: linalg.generic @@ -69,15 +79,19 @@ builtin.module { // ----- -builtin.module { - func.func @concatenate_cst() { - %cst = arith.constant dense<0> : tensor<2x3xi32> - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - flow.dispatch.tensor.store %cst, %0, offsets = [0, 2], sizes = [2, 3], strides = [1, 1] : tensor<2x3xi32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout + ]> +]> +func.func @concatenate_cst() { + %cst = arith.constant dense<0> : tensor<2x3xi32> + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + flow.dispatch.tensor.store %cst, %0, offsets = [0, 2], sizes = [2, 3], strides = [1, 1] : tensor<2x3xi32> -> !flow.dispatch.tensor> + return } + // CHECK-LABEL: func.func @concatenate_cst() // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0> : tensor<2x3xi32> // CHECK-DAG: %[[ZERO:.+]] = bufferization.to_memref %[[CST]] : memref<2x3xi32 @@ -89,21 +103,24 @@ builtin.module { // ----- -module { - func.func @already_bufferized() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1001xf32, #hal.descriptor_type> - memref.assume_alignment %0, 64 : memref<1001xf32, #hal.descriptor_type> - %alloc = memref.alloc() : memref<1001xf32> - linalg.fill ins(%cst : f32) outs(%alloc : memref<1001xf32>) - linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["reduction"]} ins(%alloc : memref<1001xf32>) outs(%0 : memref<1001xf32, #hal.descriptor_type>) { - 
^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } - memref.dealloc %alloc : memref<1001xf32> - return +#pipeline_layout = #hal.pipeline.layout + ]> +]> +func.func @already_bufferized() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<1001xf32, #hal.descriptor_type> + memref.assume_alignment %0, 64 : memref<1001xf32, #hal.descriptor_type> + %alloc = memref.alloc() : memref<1001xf32> + linalg.fill ins(%cst : f32) outs(%alloc : memref<1001xf32>) + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["reduction"]} ins(%alloc : memref<1001xf32>) outs(%0 : memref<1001xf32, #hal.descriptor_type>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 } + memref.dealloc %alloc : memref<1001xf32> + return } // CHECK-LABEL: func.func @already_bufferized diff --git a/compiler/src/iree/compiler/Codegen/Common/test/canonicalize_interface_load_store.mlir b/compiler/src/iree/compiler/Codegen/Common/test/canonicalize_interface_load_store.mlir index 1af2c7d92fe87..023b33afb4858 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/canonicalize_interface_load_store.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/canonicalize_interface_load_store.mlir @@ -1,13 +1,18 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-cleanup-buffer-alloc-view))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout + ]> +]> // CHECK-LABEL: func.func @fold_reshape_load() func.func @fold_reshape_load() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant 0.0 : f32 - // CHECK: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> + // CHECK: %[[ARG:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[ARG]], {{.*}} : !flow.dispatch.tensor> -> tensor<3x3x96xf32> %3 = flow.dispatch.tensor.load %1, offsets=[0, 0, 0, 0], sizes =[3, 3, 1, 96], strides=[1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x1x96xf32> %4 = tensor.collapse_shape %3 [[0, 1, 2, 3]] : tensor<3x3x1x96xf32> into tensor<864xf32> @@ -21,14 +26,19 @@ func.func @fold_reshape_load() { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> // CHECK-LABEL: func.func @fold_reshape_store() func.func @fold_reshape_store() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant 0.0 : f32 - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - // CHECK: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : 
!flow.dispatch.tensor> + // CHECK: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : !flow.dispatch.tensor> // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %{{.*}}, {{.*}} %3 = flow.dispatch.tensor.load %1, offsets=[0, 0, 0, 0], sizes =[3, 3, 1, 96], strides=[1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x1x96xf32> // CHECK: %[[FILL:.+]] = linalg.fill ins(%{{.+}}) outs(%[[LOAD]] : tensor<3x3x1x96xf32>) @@ -42,14 +52,19 @@ func.func @fold_reshape_store() { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> // CHECK-LABEL: func.func @dont_fold_reshape_with_not_full_load() func.func @dont_fold_reshape_with_not_full_load() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index %c96 = arith.constant 96 : index - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %1, offsets = [%c3, %c0, %c0, %c0], sizes = [%c3, %c3, %c1, %c96], strides = [%c1, %c1, %c1, %c1] : !flow.dispatch.tensor> -> tensor<3x3x1x96xf32> // CHECK: tensor.collapse_shape // CHECK: tensor.expand_shape @@ -61,15 +76,20 @@ func.func @dont_fold_reshape_with_not_full_load() { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> // CHECK-LABEL: func.func @dont_fold_dynamic_reshape() func.func @dont_fold_dynamic_reshape() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %dim0 = hal.interface.constant.load[0] : index - %dim1 = hal.interface.constant.load[1] : index - %dim2 = hal.interface.constant.load[2] : index - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%dim0, %dim1} - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%dim2} + %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %dim1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %dim2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%dim0, %dim1} + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%dim2} %3 = flow.dispatch.tensor.load %1, offsets=[0, 0, 0], sizes =[%dim0, %dim1, 96], strides=[1, 1, 1] : !flow.dispatch.tensor>{%dim0, %dim1} -> tensor // CHECK: tensor.collapse_shape // CHECK: tensor.expand_shape @@ -82,6 +102,11 @@ func.func @dont_fold_dynamic_reshape() { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> // CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 288)> // CHECK-LABEL: func.func @fold_reshape_slice_store func.func @fold_reshape_slice_store(%x: index) { @@ -89,9 +114,9 @@ func.func @fold_reshape_slice_store(%x: index) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant 0.0 : f32 - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - // CHECK: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : 
!flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + // CHECK: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : !flow.dispatch.tensor> // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %{{.*}}, {{.*}} %3 = flow.dispatch.tensor.load %1, offsets=[0, 0, 0, 0], sizes =[3, 3, 1, 96], strides=[1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x1x96xf32> // CHECK: %[[FILL:.+]] = linalg.fill ins(%{{.+}}) outs(%[[LOAD]] : tensor<3x3x1x96xf32>) diff --git a/compiler/src/iree/compiler/Codegen/Common/test/convert_bf16_to_uint16_buffers.mlir b/compiler/src/iree/compiler/Codegen/Common/test/convert_bf16_to_uint16_buffers.mlir index 1df5d798014af..399d509e32d1e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/convert_bf16_to_uint16_buffers.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/convert_bf16_to_uint16_buffers.mlir @@ -1,20 +1,27 @@ // RUN: iree-opt --split-input-file \ // RUN: --iree-convert-bf16-to-uint16-buffers %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> // CHECK-LABEL: @bf16_conversion func.func @bf16_conversion() { %c0 = arith.constant 0 : index %c8 = arith.constant 8 : index - // CHECK-DAG: %[[BUF0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} - // CHECK-DAG: %[[BUF1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} - // CHECK-DAG: %[[BUF2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref>{%c8} + // CHECK-DAG: %[[BUF0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} + // CHECK-DAG: %[[BUF1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} + // CHECK-DAG: %[[BUF2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%c0) : memref>{%c8} // CHECK-DAG: %[[LOAD0:.+]] = memref.load %[[BUF0]][%arg0] : memref> // CHECK-DAG: %[[LOAD1:.+]] = memref.load %[[BUF1]][%arg0] : memref> // CHECK: memref.store %{{.+}}, %[[BUF2]][%arg0] : memref> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref>{%c8} + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref>{%c8} + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref>{%c8} %3 = gpu.thread_id x %4 = gpu.block_dim x scf.for %arg0 = %3 to %c8 step %4 { @@ -41,6 +48,13 @@ func.func @bf16_constant(%arg0 : bf16) -> bf16 { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, 
storage_buffer> + ]> +]> + // CHECK-LABEL: @iree_uk_mmt4d // CHECK-SAME: memref // CHECK-SAME: memref @@ -63,11 +77,11 @@ func.func @mmt4d_bf16xbf16xf32() { %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %c128 = arith.constant 128 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x3x8x1xbf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x3x8x1xbf16> memref.assume_alignment %0, 64 : memref<1x3x8x1xbf16> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c64) flags(ReadOnly) : memref<1x3x8x1xbf16, strided<[24, 8, 1, 1], offset: 32>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c64) flags(ReadOnly) : memref<1x3x8x1xbf16, strided<[24, 8, 1, 1], offset: 32>> memref.assume_alignment %1, 64 : memref<1x3x8x1xbf16, strided<[24, 8, 1, 1], offset: 32>> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c128) : memref<1x1x8x8xf32, strided<[64, 64, 8, 1], offset: 32>> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c128) : memref<1x1x8x8xf32, strided<[64, 64, 8, 1], offset: 32>> memref.assume_alignment %2, 64 : memref<1x1x8x8xf32, strided<[64, 64, 8, 1], offset: 32>> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index @@ -116,14 +130,21 @@ func.func @load_trunc_f32_bf16(%arg0 : memref<32xf32>, %arg1 : memref<32xbf16>) // Test that iree_codegen.extract_strided_metadata (or any other op from iree_codegen) // is rewritten correctly, along with any following ops. 
// See issue https://github.com/iree-org/iree/issues/17177 + +#pipeline_layout = #hal.pipeline.layout + ]> +]> + // CHECK-LABEL: module @extract_strided_metadata module @extract_strided_metadata { func.func private @external_func(memref, index) attributes {llvm.bareptr = [true]} // CHECK: func.func private @external_func(memref, index) func.func @external_func_entry_point() attributes {translation_info = #iree_codegen.translation_info} { - %0 = hal.interface.constant.load[0] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = arith.index_castui %0 : i32 to index - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%1) flags(ReadOnly) : memref<1x8x768xbf16, strided<[6144, 768, 1], offset: ?>> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%1) flags(ReadOnly) : memref<1x8x768xbf16, strided<[6144, 768, 1], offset: ?>> // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan {{.*}} : memref<1x8x768xi16, %base_buffer, %offset, %sizes:3, %strides:3 = iree_codegen.extract_strided_metadata %2 : memref<1x8x768xbf16, strided<[6144, 768, 1], offset: ?>> -> memref, index, index, index, index, index, index, index // CHECK: {{.+}} = iree_codegen.extract_strided_metadata %[[SUBSPAN]] : memref<1x8x768xi16, diff --git a/compiler/src/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir b/compiler/src/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir index c075e09767f7c..1101469a0fb3a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/convert_to_destination_passing_style.mlir @@ -1,13 +1,21 @@ // RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-codegen-convert-to-destination-passing-style),canonicalize,cse)" --split-input-file | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> func.func @matmul() { - %m = hal.interface.constant.load[0] : index - %n = hal.interface.constant.load[1] : index - %k = hal.interface.constant.load[2] : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%m, %k} - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%k, %n} - %init = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%m, %n} - %result = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%m, %n} + %m = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %n = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %k = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %lhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%m, %k} + %rhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%k, %n} + %init = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%m, %n} + %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%m, %n} %wg_id_y = hal.interface.workgroup.id[1] : index %wg_count_y = 
hal.interface.workgroup.count[1] : index %wg_size_y = hal.interface.workgroup.size[1] : index @@ -32,10 +40,10 @@ func.func @matmul() { return } // CHECK: func.func @matmul() -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) // CHECK: scf.for %[[IV0:.+]] = // CHECK: scf.for %[[IV1:.+]] = // CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]] @@ -48,15 +56,22 @@ func.func @matmul() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @matmul_fill() { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index - %m = hal.interface.constant.load[0] : index - %n = hal.interface.constant.load[1] : index - %k = hal.interface.constant.load[2] : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%m, %k} - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%k, %n} - %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%m, %n} + %m = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %n = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %k = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %lhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%m, %k} + %rhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%k, %n} + %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%m, %n} %wg_id_y = hal.interface.workgroup.id[1] : index %wg_count_y = hal.interface.workgroup.count[1] : index %wg_size_y = hal.interface.workgroup.size[1] : index @@ -82,9 +97,9 @@ func.func @matmul_fill() { return } // CHECK: func.func @matmul_fill() -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: scf.for %[[IV0:.+]] = // CHECK: scf.for %[[IV1:.+]] = // CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]] @@ -99,14 +114,21 @@ func.func @matmul_fill() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + 
#hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @matmul_inplace() { %c0 = arith.constant 0 : index - %m = hal.interface.constant.load[0] : index - %n = hal.interface.constant.load[1] : index - %k = hal.interface.constant.load[2] : index - %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%m, %k} - %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%k, %n} - %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%m, %n} + %m = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %n = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %k = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %lhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%m, %k} + %rhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%k, %n} + %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%m, %n} %wg_id_y = hal.interface.workgroup.id[1] : index %wg_count_y = hal.interface.workgroup.count[1] : index %wg_size_y = hal.interface.workgroup.size[1] : index @@ -131,9 +153,9 @@ func.func @matmul_inplace() { return } // CHECK: func.func @matmul_inplace() -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: scf.for %[[IV0:.+]] = // CHECK: scf.for %[[IV1:.+]] = // CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]] @@ -146,36 +168,48 @@ func.func @matmul_inplace() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @reshape_simple() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index %c12 = arith.constant 12 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32> %3 = tensor.expand_shape %2 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32> flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor> return } // CHECK: func.func @reshape_simple() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan 
layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK: %[[SOURCE:.+]] = flow.dispatch.tensor.load %[[ARG0]]
 // CHECK: %[[RESHAPE:.+]] = tensor.expand_shape %[[SOURCE]]
 // CHECK: flow.dispatch.tensor.store %[[RESHAPE]], %[[RET0]]
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @reshape_fused_source() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c4 = arith.constant 4 : index
   %c12 = arith.constant 12 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<12xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3x4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<12xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<writeonly:tensor<3x4xi32>>
   %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor<readonly:tensor<12xi32>> -> tensor<12xi32>
   %3 = tensor.expand_shape %2 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32>
   %4 = tensor.empty() : tensor<3x4xi32>
@@ -191,8 +225,8 @@ func.func @reshape_fused_source() {
   return
 }
 // CHECK: func.func @reshape_fused_source()
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK: %[[TARGET:.+]] = flow.dispatch.tensor.load %[[RET0]]
 // CHECK: %[[SOURCE:.+]] = flow.dispatch.tensor.load %[[ARG0]]
 // CHECK: %[[RESHAPE:.+]] = tensor.expand_shape %[[SOURCE]]
@@ -203,15 +237,22 @@ func.func @reshape_fused_source() {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @reshape_fused_source_and_copyout() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c4 = arith.constant 4 : index
   %c12 = arith.constant 12 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<12xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3x4xi32>>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3x4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<12xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<writeonly:tensor<3x4xi32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<3x4xi32>>
   %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor<readonly:tensor<12xi32>> -> tensor<12xi32>
   %4 = tensor.expand_shape %3 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32>
   %5 = tensor.empty() : tensor<3x4xi32>
@@ -228,9 +269,9 @@ func.func @reshape_fused_source_and_copyout() {
   return
 }
 // CHECK: func.func @reshape_fused_source_and_copyout()
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0)
binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-DAG: %[[TARGET:.+]] = flow.dispatch.tensor.load %[[RET0]] // CHECK: %[[SOURCE:.+]] = flow.dispatch.tensor.load %[[ARG0]] // CHECK: %[[RESHAPE:.+]] = tensor.expand_shape %[[SOURCE]] @@ -242,14 +283,20 @@ func.func @reshape_fused_source_and_copyout() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @reshape_fused_target() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index %c12 = arith.constant 12 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> %3 = tensor.empty() : tensor<3x4xi32> %4 = linalg.generic { @@ -265,8 +312,8 @@ func.func @reshape_fused_target() { return } // CHECK: func.func @reshape_fused_target() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[SOURCE:.+]] = flow.dispatch.tensor.load %[[ARG0]] // CHECK-DAG: %[[TARGET:.+]] = flow.dispatch.tensor.load %[[RET0]] // CHECK-DAG: %[[RESHAPE_EXPAND:.+]] = tensor.expand_shape %[[TARGET]] {{\[}}[0, 1]{{\]}} @@ -278,6 +325,13 @@ func.func @reshape_fused_target() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @cast_followed_by_store() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 @@ -285,9 +339,9 @@ func.func @cast_followed_by_store() { %c64 = arith.constant 64 : index %c1 = arith.constant 1 : index %c32 = arith.constant 32 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = 
hal.interface.workgroup.id[1] : index @@ -314,9 +368,9 @@ func.func @cast_followed_by_store() { return } // CHECK: func.func @cast_followed_by_store() -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: scf.for %[[IV0:.+]] = // CHECK: scf.for %[[IV1:.+]] = // CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]] @@ -331,6 +385,14 @@ func.func @cast_followed_by_store() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> func.func @multi_result() { %c0 = arith.constant 0 : index @@ -338,22 +400,22 @@ func.func @multi_result() { %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index - %dim0 = hal.interface.constant.load[0] : index - %dim1 = hal.interface.constant.load[1] : index - %dim2 = hal.interface.constant.load[2] : index - %dim3 = hal.interface.constant.load[3] : index - %dim4 = hal.interface.constant.load[4] : index - %dim5 = hal.interface.constant.load[5] : index - %dim6 = hal.interface.constant.load[6] : index - %dim7 = hal.interface.constant.load[7] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%dim0, %dim1} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%dim2, %dim3} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%dim4, %dim5} - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%dim6, %dim7} - %4 = hal.interface.constant.load[8] : index - %5 = hal.interface.constant.load[9] : index - %6 = hal.interface.constant.load[10] : index - %7 = hal.interface.constant.load[11] : index + %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %dim1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %dim2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %dim3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %dim4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %dim5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %dim6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index + %dim7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%dim0, %dim1} + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%dim2, %dim3} + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%dim4, %dim5} + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%dim6, %dim7} + %4 = 
hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : index + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : index + %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(10) : index + %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(11) : index %8 = hal.interface.workgroup.id[0] : index %9 = hal.interface.workgroup.id[1] : index %10 = hal.interface.workgroup.count[0] : index @@ -386,10 +448,10 @@ func.func @multi_result() { return } // CHECK: func.func @multi_result() -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RESULT0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// CHECK-DAG: %[[RESULT1:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RESULT0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[RESULT1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) // CHECK: scf.for %[[IV0:.+]] = // CHECK: scf.for %[[IV1:.+]] = // CHECK-DAG: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]] @@ -404,25 +466,32 @@ func.func @multi_result() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @unused_ins_operand() { %c64 = arith.constant 64 : index %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 - %5 = hal.interface.constant.load[5] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 %6 = arith.index_cast %0 : i32 to index %7 = arith.index_cast %1 : i32 to index %8 = arith.index_cast %2 : i32 to index %9 = arith.index_cast %3 : i32 to index %10 = arith.index_cast %4 : i32 to index %11 = arith.index_cast %5 : i32 to index - %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c32) : !flow.dispatch.tensor>{%6, %7, %8} - %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c64) : !flow.dispatch.tensor>{%9, %10, %11} - %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%9, %10, %8} + %12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c32) : !flow.dispatch.tensor>{%6, %7, %8} + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c64) : !flow.dispatch.tensor>{%9, %10, %11} + %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) 
alignment(32) offset(%c0) : !flow.dispatch.tensor>{%9, %10, %8} %15 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [%9, %10, %11], strides = [1, 1, 1] : !flow.dispatch.tensor>{%9, %10, %11} -> tensor %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index @@ -462,8 +531,8 @@ func.func @unused_ins_operand() { return } // CHECK-LABEL: func.func @unused_ins_operand() -// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-DAG: %[[IN_VIEW:.+]] = flow.dispatch.tensor.load %[[IN]] // CHECK-DAG: %[[OUT_VIEW:.+]] = flow.dispatch.tensor.load %[[OUT]] // CHECK: linalg.generic @@ -472,11 +541,17 @@ func.func @unused_ins_operand() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @cumsum__2x2x2x2x2x2x2() { %cst = arith.constant dense<0.000000e+00> : tensor<2x2x2x2x2x2x2xf32> %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0, 0], sizes = [3, 2, 2, 2, 2, 2, 2], strides = [1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x2x2x2x2x2x2xf32> %3 = tensor.empty() : tensor<2xf32> %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0 + d7, d1, d2, d3, d4, d5, d6)>, @@ -502,11 +577,17 @@ func.func @cumsum__2x2x2x2x2x2x2() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @reduce_window_max_4x6xf32() { %cst = arith.constant dense<0xFF800000> : tensor<2x2xf32> %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 4, 6], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x6xf32> %3 = tensor.empty() : tensor<2x2x3xf32> %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d2, d0 * 2 + d3, d1 * 3 + d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%2, %3 : tensor<2x4x6xf32>, tensor<2x2x3xf32>) outs(%cst : tensor<2x2xf32>) { @@ -526,9 +607,14 @@ func.func @reduce_window_max_4x6xf32() { // ----- +#pipeline_layout = 
#hal.pipeline.layout + ]> +]> func.func @sort1D() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> %1 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor> -> tensor<4xi32> %2 = iree_linalg_ext.sort dimension(0) outs(%1 : tensor<4xi32>) { ^bb0(%arg0: i32, %arg1: i32): @@ -539,7 +625,7 @@ func.func @sort1D() { return } // CHECK: func.func @sort1D() -// CHECK-DAG: %[[BUF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) +// CHECK-DAG: %[[BUF:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-DAG: %[[IN:.+]] = flow.dispatch.tensor.load %[[BUF]] // CHECK: %[[SORT:.+]] = iree_linalg_ext.sort // CHECK-SAME: outs(%[[IN]] : tensor<4xi32>) @@ -547,12 +633,18 @@ func.func @sort1D() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @clone_index_computations() { %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = arith.index_castui %0 : i32 to index - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} - %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %4 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] @@ -572,22 +664,31 @@ func.func @clone_index_computations() { return } // CHECK-LABEL: func @clone_index_computations() -// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK: scf.for // CHECK: %[[TILESIZE:.+]] = affine.min // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[OUTPUT]], offsets = [{{.+}}], sizes = [%[[TILESIZE]]] // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<5, storage_buffer> + ]> +]> func.func @gemm_gather() { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> - %result = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) 
: !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(5) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1] @@ -623,12 +724,19 @@ func.func @gemm_gather() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @reduce_broadcast_generic() { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets= [0, 0], sizes = [10, 1024], strides= [1, 1] : !flow.dispatch.tensor> -> tensor<10x1024xf32> %4 = flow.dispatch.tensor.load %1, offsets= [0], sizes = [10], strides= [1] @@ -659,7 +767,7 @@ func.func @reduce_broadcast_generic() { return } // CHECK-LABEL: func @reduce_broadcast_generic -// CHECK: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(4) +// CHECK: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: %[[OUT:.+]] = flow.dispatch.tensor.load %[[OUT_BINDING]] // CHECK: %[[RESULT:.+]]:2 = linalg.generic // CHECK: linalg.generic @@ -668,10 +776,16 @@ func.func @reduce_broadcast_generic() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @pack() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x4xi32> %3 = tensor.empty() : tensor<2x2x2x2xi32> %pack = tensor.pack %2 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %3 : tensor<4x4xi32> -> tensor<2x2x2x2xi32> @@ -679,18 +793,24 @@ func.func @pack() { return } // CHECK-LABEL: func.func @pack -// CHECK-DAG: %[[IN_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) -// 
CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[IN:.+]] = flow.dispatch.tensor.load %[[IN_BINDING]]
 // CHECK-DAG: %[[OUT:.+]] = flow.dispatch.tensor.load %[[OUT_BINDING]]
 // CHECK: tensor.pack %[[IN]] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %[[OUT]]
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @unpack() {
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x4xi32>>
   %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 2, 2, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>> -> tensor<2x2x2x2xi32>
   %3 = tensor.empty() : tensor<4x4xi32>
   %4 = tensor.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %3 : tensor<2x2x2x2xi32> -> tensor<4x4xi32>
@@ -698,14 +818,20 @@ func.func @unpack() {
   return
 }
 // CHECK-LABEL: func.func @unpack
-// CHECK-DAG: %[[IN_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[IN:.+]] = flow.dispatch.tensor.load %[[IN_BINDING]]
 // CHECK-DAG: %[[OUT:.+]] = flow.dispatch.tensor.load %[[OUT_BINDING]]
 // CHECK: tensor.unpack %[[IN]] inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %[[OUT]]
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
@@ -717,8 +843,8 @@ func.func @non_perfect_tiling_unpack() {
   %0:2 = iree_codegen.query_tile_sizes tensor<16x16xi32, #iree_encoding.encoding> -> index, index
   %1 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%0#0]
   %2 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%0#1]
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c512) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi32>>{%1, %2, %0#0, %0#1}
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x16xi32>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi32>>{%1, %2, %0#0, %0#1}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x16xi32>>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_count_x = hal.interface.workgroup.count[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -751,17 +877,26 @@ func.func @non_perfect_tiling_unpack() {
 // -----
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> func.func @multi_result_dispatches() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %30 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) + %30 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = tensor.empty() : tensor<120x360xf32> %cst = arith.constant 0.0 : f32 @@ -790,12 +925,12 @@ func.func @multi_result_dispatches() { return } // CHECK-LABEL: func @multi_result_dispatches() -// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) -// CHECK-DAG: %[[BIAS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) -// CHECK-DAG: %[[RESULT_BINDING0:.+]] = hal.interface.binding.subspan set(0) binding(3) +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[BIAS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[RESULT_BINDING0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) // CHECK-DAG: %[[RESULT0:.+]] = flow.dispatch.tensor.load %[[RESULT_BINDING0]] -// CHECK-DAG: %[[RESULT_BINDING1:.+]] = hal.interface.binding.subspan set(0) binding(4) +// CHECK-DAG: %[[RESULT_BINDING1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(4) // CHECK-DAG: %[[RESULT1:.+]] = flow.dispatch.tensor.load %[[RESULT_BINDING1]] // CHECK: %[[FILL:.+]] = linalg.fill // CHECK-SAME: outs(%[[RESULT1]] : @@ -813,17 +948,24 @@ func.func @multi_result_dispatches() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @if_conversion() { - %0 = hal.interface.constant.load[0] : index - %offset = hal.interface.constant.load[1] : index - %size = hal.interface.constant.load[2] : index - %cond = hal.interface.constant.load[3] : i1 - %result_offset = hal.interface.constant.load[4] : index - %then = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %offset = hal.interface.constant.load 
layout(#pipeline_layout) ordinal(1) : index + %size = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %cond = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i1 + %result_offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %then = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0} - %else = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %else = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%0} - %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0} %then_value = flow.dispatch.tensor.load %then, offsets = [%offset], sizes = [%size], strides = [1] : !flow.dispatch.tensor>{%0} -> tensor @@ -839,13 +981,13 @@ func.func @if_conversion() { return } // CHECK-LABEL: func @if_conversion() -// CHECK-DAG: %[[S0:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[S1:.+]] = hal.interface.constant.load[2] -// CHECK-DAG: %[[COND:.+]] = hal.interface.constant.load[3] -// CHECK-DAG: %[[OFFSET:.+]] = hal.interface.constant.load[4] -// CHECK-DAG: %[[THEN_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[ELSE_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) -// CHECK-DAG: %[[RESULT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK-DAG: %[[S0:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) +// CHECK-DAG: %[[S1:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) +// CHECK-DAG: %[[COND:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) +// CHECK-DAG: %[[OFFSET:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) +// CHECK-DAG: %[[THEN_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ELSE_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RESULT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-DAG: %[[THEN:.+]] = flow.dispatch.tensor.load %[[THEN_BINDING]] // CHECK-DAG: %[[ELSE:.+]] = flow.dispatch.tensor.load %[[ELSE_BINDING]] // CHECK: scf.if %[[COND]] { @@ -861,21 +1003,27 @@ func.func @if_conversion() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @if_conversion_clone_offsets() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 %5 = arith.index_castui %0 : i32 to index %6 = arith.index_castui %1 : i32 to index %7 = arith.index_castui %2 : i32 to index %8 = arith.index_castui %3 : i32 to index %9 = arith.index_castui %4 : i32 to index - %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) 
alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%6, %7} - %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor>{%8, %9} + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%6, %7} + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor>{%8, %9} %12 = affine.apply affine_map<()[s0, s1] -> (-s0 + s1 + (s0 ceildiv 16) * 16)>()[%6, %6] %13 = affine.apply affine_map<()[s0, s1] -> (-s0 + s1 + (s0 ceildiv 16) * 16)>()[%7, %7] %workgroup_id_x = hal.interface.workgroup.id[0] : index diff --git a/compiler/src/iree/compiler/Codegen/Common/test/decompose_conv2d.mlir b/compiler/src/iree/compiler/Codegen/Common/test/decompose_conv2d.mlir index ab1f826171048..38ee86b35ef83 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/decompose_conv2d.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/decompose_conv2d.mlir @@ -3,12 +3,19 @@ #config = #iree_codegen.lowering_config #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> #translation = #iree_codegen.translation_info +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> module { func.func @restrict_num_workgroups() attributes {hal.executable.target = #executable_target_system_elf_arm_64_, translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %input = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 1, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x4x4xf32> %filter = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 4, 4], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4xf32> %5 = tensor.empty() : tensor<1x1x1x4xf32> diff --git a/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir b/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir index cc7600b25f3aa..45301a4e5ba8f 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/eliminate_empty_tensors.mlir @@ -1,12 +1,18 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-eliminate-empty-tensors))" %s | FileCheck %s // ----- + +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @eliminate_empty_tensors_with_store_op() { %c0 = arith.constant 0 : index %c8 = arith.constant 8 : index %c32 = arith.constant 32 : index %c128 = arith.constant 128 : index - %0 = hal.interface.binding.subspan set(0) 
binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<32x384xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<32x384xf32>>
   %1 = tensor.empty() : tensor<32x384xf32>
   scf.for %arg0 = %c0 to %c128 step %c32 {
     %2 = scf.for %arg1 = %c0 to %c32 step %c8 iter_args(%arg2 = %1) -> (tensor<32x384xf32>) {
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/emulate_narrow_type.mlir b/compiler/src/iree/compiler/Codegen/Common/test/emulate_narrow_type.mlir
index 80a62e7050e48..84f84cc8a0a31 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/emulate_narrow_type.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/emulate_narrow_type.mlir
@@ -1,8 +1,13 @@
 // RUN: iree-opt --split-input-file --iree-codegen-emulate-narrow-type %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
 func.func @memref_i4_to_i8() -> i4 {
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<3x15xi4>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<3x15xi4>
   %1 = memref.load %0[%c0, %c0] : memref<3x15xi4>
   return %1 : i4
 }
@@ -11,9 +16,14 @@ func.func @memref_i4_to_i8() -> i4 {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
 func.func @memref_i4_to_i8_dynamic(%arg0 : index, %arg1 : index, %arg2 : index) -> i4 {
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%arg0) flags(ReadOnly) : memref<?x?xi4, strided<[?, 1], offset: ?>>{%arg1, %arg2}
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%arg0) flags(ReadOnly) : memref<?x?xi4, strided<[?, 1], offset: ?>>{%arg1, %arg2}
   %1 = memref.load %0[%c0, %c0] : memref<?x?xi4, strided<[?, 1], offset: ?>>
   return %1 : i4
 }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir b/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
index d0b656a0429f1..48c1922c00f46 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/flatten_memref_subspan.mlir
@@ -1,7 +1,12 @@
 // RUN: iree-opt --split-input-file --iree-codegen-flatten-memref-subspan --canonicalize --allow-unregistered-dialect %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
 func.func @load_subspan_with_offset(%offset : index, %i0: index, %i1: index, %i2: index) -> f32 {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<6x7x8xf32, strided<[56, 8, 1], offset: ?>>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<6x7x8xf32, strided<[56, 8, 1], offset: ?>>
   %val = memref.load %subspan[%i0, %i1, %i2] : memref<6x7x8xf32, strided<[56, 8, 1], offset: ?>>
   return %val: f32
 }
@@ -12,15 +17,20 @@ func.func @load_subspan_with_offset(%offset : index, %i0: index, %i1: index, %i2
 // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
 // CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 // CHECK: %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]]
 // CHECK: return %[[LOAD]]
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
 func.func @store_subspan_with_offset(%value: f32, %offset : index, %i0: index, %i1: index, %i2: index) {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<2x3x4xf32, strided<[12, 4, 1], offset: ?>>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<2x3x4xf32, strided<[12, 4, 1], offset: ?>>
   memref.store %value, %subspan[%i0, %i1, %i2] : memref<2x3x4xf32, strided<[12, 4, 1], offset: ?>>
   return
 }
@@ -31,14 +41,19 @@ func.func @store_subspan_with_offset(%value: f32, %offset : index, %i0: index, %
 // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
 // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[ZERO]]) : memref<?xf32>{%[[SIZE]]}
 // CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
 // CHECK: memref.store %[[VALUE]], %[[SUBSPAN]][%[[INDEX]]] : memref<?xf32>
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
 func.func @load_subspan_with_vector_element(%offset : index, %i0: index, %i1: index, %i2: index) -> vector<4xf32> {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<6x7x8xvector<4xf32>, strided<[56, 8, 1], offset:?>>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<6x7x8xvector<4xf32>, strided<[56, 8, 1], offset:?>>
   %val = memref.load %subspan[%i0, %i1, %i2] : memref<6x7x8xvector<4xf32>, strided<[56, 8, 1], offset:?>>
   return %val: vector<4xf32>
 }
@@ -49,8 +64,13 @@ func.func @load_subspan_with_vector_element(%offset : index, %i0: index, %i1: in
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
 func.func @load_subspan_with_16bit_element(%offset : index, %i0: index, %i1: index, %i2: index) -> f16 {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<6x7x8xf16, strided<[56, 8, 1], offset:?>>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<6x7x8xf16, strided<[56, 8, 1], offset:?>>
   %val = memref.load %subspan[%i0, %i1, %i2] : memref<6x7x8xf16, strided<[56, 8, 1], offset:?>>
   return %val: f16
 }
@@ -61,9 +81,15 @@ func.func @load_subspan_with_16bit_element(%offset : index, %i0: index, %i1: ind
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 func.func @store_subspan_with_leading_dynamic_dim(%value: f32, %offset : index, %i0: index, %i1: index, %i2: index) {
-  %dim = hal.interface.constant.load[0] : index
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref>{%dim}
+  %dim = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref>{%dim}
   memref.store %value, %subspan[%i0, %i1, %i2] : memref>
   return
 }
@@
-73,21 +99,25 @@ func.func @store_subspan_with_leading_dynamic_dim(%value: f32, %offset : index, //CHECK-LABEL: func.func @store_subspan_with_leading_dynamic_dim // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index) // CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[DIM:.+]] = hal.interface.constant.load[0] : index +// CHECK: %[[DIM:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index // CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM]], %[[OFFSET]]] -// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]] // CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] : memref // ----- - +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @store_subspan_with_all_dynamic_dim(%value: f32, %offset : index, %i0: index, %i1: index, %i2: index, %i3: index) { - %dim0 = hal.interface.constant.load[0] : index - %dim1 = hal.interface.constant.load[1] : index - %dim2 = hal.interface.constant.load[2] : index - %dim3 = hal.interface.constant.load[3] : index - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref>{%dim0, %dim1, %dim2, %dim3} + %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %dim1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %dim2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %dim3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref>{%dim0, %dim1, %dim2, %dim3} memref.store %value, %subspan[%i0, %i1, %i2, %i3] : memref> return } @@ -97,21 +127,26 @@ func.func @store_subspan_with_all_dynamic_dim(%value: f32, %offset : index, %i0: //CHECK-LABEL: func.func @store_subspan_with_all_dynamic_dim // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index) // CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[DIM0:.+]] = hal.interface.constant.load[0] : index -// CHECK: %[[DIM1:.+]] = hal.interface.constant.load[1] : index -// CHECK: %[[DIM2:.+]] = hal.interface.constant.load[2] : index -// CHECK: %[[DIM3:.+]] = hal.interface.constant.load[3] : index +// CHECK: %[[DIM0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK: %[[DIM1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index +// CHECK: %[[DIM2:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index +// CHECK: %[[DIM3:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3) : index // CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM0]], %[[DIM1]], %[[DIM2]], %[[DIM3]], %[[OFFSET]]] -// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[DIM3]], %[[I3]], %[[DIM2]], %[[I2]], %[[I0]], %[[DIM1]], %[[I1]]] // CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] // 
----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @store_subspan_with_mixed_dynamic_dim(%value: f32, %offset : index, %i0: index, %i1: index, %i2: index, %i3: index) { - %dim0 = hal.interface.constant.load[0] : index - %dim1 = hal.interface.constant.load[1] : index - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref>{%dim0, %dim1} + %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %dim1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref>{%dim0, %dim1} memref.store %value, %subspan[%i0, %i1, %i2, %i3] : memref> return } @@ -121,18 +156,23 @@ func.func @store_subspan_with_mixed_dynamic_dim(%value: f32, %offset : index, %i //CHECK-LABEL: func.func @store_subspan_with_mixed_dynamic_dim // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index) // CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[DIM0:.+]] = hal.interface.constant.load[0] : index -// CHECK: %[[DIM2:.+]] = hal.interface.constant.load[1] : index +// CHECK: %[[DIM0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK: %[[DIM2:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index // CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM0]], %[[DIM2]], %[[OFFSET]]] -// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I3]], %[[DIM2]], %[[I2]], %[[I0]], %[[I1]]] // CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @store_subspan_with_flow_control(%value: f32, %offset : index, %i0: index, %i1: index, %i2: index) { - %dim = hal.interface.constant.load[0] : index - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref>{%dim} + %dim = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref>{%dim} scf.for %i = %i0 to %i1 step %i2 { memref.store %value, %subspan[%i0, %i1, %i2] : memref> } @@ -144,9 +184,9 @@ func.func @store_subspan_with_flow_control(%value: f32, %offset : index, %i0: in //CHECK-LABEL: func.func @store_subspan_with_flow_control // CHECK-SAME: (%[[VALUE:.+]]: f32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index) // CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[DIM:.+]] = hal.interface.constant.load[0] : index +// CHECK: %[[DIM:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index // CHECK: %[[SIZE:.+]] = affine.apply #[[$SIZE_MAP]]()[%[[DIM]], %[[OFFSET]]] -// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: scf.for // CHECK: %[[INDEX:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]] // CHECK: memref.store %[[VALUE]], %[[DST]][%[[INDEX]]] : memref @@ 
-216,9 +256,14 @@ func.func @load_global_with_offset(%i0: index, %i1: index, %i2: index, %i3: inde // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @transfer_read_subspan_with_offset( %arg0 : index, %arg1: index, %arg2: index, %arg3: index) -> vector<4xf32> { - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%arg0) : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>> + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%arg0) : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>> %cst = arith.constant 0.0 : f32 %val = vector.transfer_read %subspan[%arg1, %arg2, %arg3], %cst {in_bounds = [true]} : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>>, vector<4xf32> return %val: vector<4xf32> @@ -233,16 +278,21 @@ func.func @transfer_read_subspan_with_offset( // CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: index // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]] -// CHECK: %[[MEMREF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[MEMREF:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]] // CHECK: %[[VEC:.+]] = vector.transfer_read %[[MEMREF]][%[[INDEX]]] // CHECK: return %[[VEC]] // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @transfer_write_subspan_with_offset( %arg0 : index, %arg1: index, %arg2: index, %arg3: index, %arg4 : vector<4xf32>) { - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%arg0) : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>> + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%arg0) : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>> vector.transfer_write %arg4, %subspan[%arg1, %arg2, %arg3] {in_bounds = [true]} : vector<4xf32>, memref<6x7x8xf32, strided<[56, 8, 1], offset:?>> return } @@ -257,15 +307,21 @@ func.func @transfer_write_subspan_with_offset( // CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: vector<4xf32> // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]] -// CHECK: %[[MEMREF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[MEMREF:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]] // CHECK: vector.transfer_write %[[ARG4]], %[[MEMREF]][%[[INDEX]]] // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @load_store_subspan_with_zero_offset(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) { - %subspan0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref{%arg0, %arg1} - %subspan1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref{%arg0, %arg1} + %subspan0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref{%arg0, %arg1} + %subspan1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref{%arg0, %arg1} %val = memref.load %subspan0[%arg2, %arg3] : memref memref.store %val, %subspan1[%arg2, %arg3] : memref return @@ -279,9 +335,9 
@@ func.func @load_store_subspan_with_zero_offset(%arg0 : index, %arg1 : index, %ar // CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: index // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[D0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]], %[[ARG1]]] -// CHECK: %[[BINDING0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[D0]]} +// CHECK: %[[BINDING0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[D0]]} // CHECK: %[[D1:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]], %[[ARG1]]] -// CHECK: %[[BINDING1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%[[C0]]) : memref{%[[D1]]} +// CHECK: %[[BINDING1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) offset(%[[C0]]) : memref{%[[D1]]} // CHECK: %[[OFFSET0:.+]] = affine.apply #[[$MAP1]]()[%[[ARG2]], %[[ARG1]], %[[ARG3]]] // CHECK: %[[VAL:.+]] = memref.load %[[BINDING0]][%[[OFFSET0]]] // CHECK: %[[OFFSET1:.+]] = affine.apply #[[$MAP1]]()[%[[ARG2]], %[[ARG1]], %[[ARG3]]] @@ -289,10 +345,16 @@ func.func @load_store_subspan_with_zero_offset(%arg0 : index, %arg1 : index, %ar // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @load_store_rank_zero_subspan_with_zero_offset() { %zero = arith.constant 0 : index - %subspan0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%zero) : memref - %subspan1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%zero) : memref + %subspan0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%zero) : memref + %subspan1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) offset(%zero) : memref %val = memref.load %subspan0[] : memref memref.store %val, %subspan1[] : memref return @@ -300,14 +362,20 @@ func.func @load_store_rank_zero_subspan_with_zero_offset() { //CHECK-LABEL: func.func @load_store_rank_zero_subspan_with_zero_offset // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref -// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%[[C0]]) : memref +// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref +// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) offset(%[[C0]]) : memref // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @load_store_rank_zero_subspan_with_offset(%offset : index) { - %subspan0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref> - %subspan1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%offset) : memref> + %subspan0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref> + %subspan1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) offset(%offset) : memref> %val = memref.load %subspan0[] : memref> memref.store %val, %subspan1[] : memref> return @@ -319,9 +387,9 @@ func.func @load_store_rank_zero_subspan_with_offset(%offset : index) { // CHECK-SAME: (%[[OFFSET:.+]]: index) // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[SIZE0:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]] -// 
CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE0]]} +// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE0]]} // CHECK-DAG: %[[SIZE1:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]] -// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE1]]} +// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) offset(%[[C0]]) : memref{%[[SIZE1]]} // CHECK: %[[INDEX0:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]] // CHECK: %[[LOAD:.+]] = memref.load %[[SPAN0]][%[[INDEX0]]] : memref // CHECK: %[[INDEX1:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]] @@ -329,8 +397,13 @@ func.func @load_store_rank_zero_subspan_with_offset(%offset : index) { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @collapse_shape(%offset : index, %i0 : index, %i1 : index) -> f32 { - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<4x5x6x7xf32, strided<[210, 42, 7, 1], offset:?>> + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<4x5x6x7xf32, strided<[210, 42, 7, 1], offset:?>> %collapse = memref.collapse_shape %subspan[[0, 1], [2, 3]] : memref<4x5x6x7xf32, strided<[210, 42, 7, 1], offset:?>> into memref<20x42xf32, strided<[42, 1], offset:?>> %value = memref.load %collapse[%i0, %i1] : memref<20x42xf32, strided<[42, 1], offset:?>> return %value : f32 @@ -342,14 +415,19 @@ func.func @collapse_shape(%offset : index, %i0 : index, %i1 : index) -> f32 { // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index) // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]] -// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]] // CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]] // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @expand_shape(%offset : index, %i0: index, %i1: index, %i2: index, %i3: index) -> f32 { - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<20x42xf32, strided<[42, 1], offset:?>> + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<20x42xf32, strided<[42, 1], offset:?>> %expand = memref.expand_shape %subspan[[0, 1], [2, 3]] output_shape [4, 5, 6, 7] : memref<20x42xf32, strided<[42, 1], offset:?>> into memref<4x5x6x7xf32, strided<[210, 42, 7, 1], offset:?>> %value = memref.load %expand[%i0, %i1, %i2, %i3] : memref<4x5x6x7xf32, strided<[210, 42, 7, 1], offset:?>> return %value : f32 @@ -361,14 +439,19 @@ func.func @expand_shape(%offset : index, %i0: index, %i1: index, %i2: index, %i3 // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index, %[[I3:.+]]: index) // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]] -// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: 
%[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]], %[[I3]]] // CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]] // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @expand_shape2(%offset : index, %i0: index, %i1: index) -> f32 { - %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<128xf32, strided<[1], offset: ?>> + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<128xf32, strided<[1], offset: ?>> %expand = memref.expand_shape %subspan [[0, 1]] output_shape [1, 128] : memref<128xf32, strided<[1], offset: ?>> into memref<1x128xf32, strided<[128, 1], offset: ?>> %value = memref.load %expand[%i0, %i1] : memref<1x128xf32, strided<[128, 1], offset: ?>> return %value : f32 @@ -380,7 +463,7 @@ func.func @expand_shape2(%offset : index, %i0: index, %i1: index) -> f32 { // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index) // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]] -// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]] // CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]] @@ -389,8 +472,14 @@ func.func @expand_shape2(%offset : index, %i0: index, %i1: index) -> f32 { // An opaque consumer that already takes a collapsed, static 1d memref should // be able to do so (a memref cast is inserted to move between unknown and // known dim). 
+
+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @static_collapse_shape_to_1d_static(%offset : index, %i: index) {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>>
  %collapse = memref.collapse_shape %subspan [[0, 1, 2]] : memref<6x7x8xf32, strided<[56, 8, 1], offset:?>> into memref<336xf32, strided<[1], offset: ?>>
  "unregistered.opaque"(%collapse) : (memref<336xf32, strided<[1], offset: ?>>) -> ()
}
@@ -402,15 +491,20 @@ func.func @static_collapse_shape_to_1d_static(%offset : index, %i: index) {
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[OFFSET:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP1]]()[%[[ARG0]]
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]}
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]}
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[SUBSPAN]][%[[OFFSET]]] [336] [1] : memref to memref<336xf32, strided<[1], offset: ?>>
// CHECK: "unregistered.opaque"(%[[SUBVIEW]])

// -----

+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @subview(%offset : index, %i0: index, %i1: index) -> f32 {
  %c0 = arith.constant 0 : index
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<32x128xf32, strided<[128, 1], offset: ?>>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<32x128xf32, strided<[128, 1], offset: ?>>
  %expand = memref.subview %subspan[%i0, %i1][16, 8][1, 1] : memref<32x128xf32, strided<[128, 1], offset: ?>> to memref<16x8xf32, strided<[128, 1], offset: ?>>
  %value = memref.load %expand[%c0, %c0] : memref<16x8xf32, strided<[128, 1], offset: ?>>
  return %value : f32
@@ -422,7 +516,7 @@ func.func @subview(%offset : index, %i0: index, %i1: index) -> f32 {
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[C0]]) : memref{%[[SIZE]]}
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref{%[[SIZE]]}
// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: memref.load %[[SUBSPAN]][%[[INDEX]]]
@@ -459,8 +553,13 @@ func.func @subgroup_mma_store(%i0: index, %i1: index, %val: !gpu.mma_matrix<16x1

// -----

+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @subgroup_mma_load_with_offset(%offset : index, %i0: index, %i1: index) -> !gpu.mma_matrix<16x16xf16, "AOp"> {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<32x32xf16, strided<[32, 1], offset: ?>, 3>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<32x32xf16, strided<[32, 1], offset: ?>, 3>
  %0 = gpu.subgroup_mma_load_matrix %subspan[%i0, %i1] {leadDimension = 32 : index} : memref<32x32xf16, strided<[32, 1], offset: ?>, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
  return %0 : !gpu.mma_matrix<16x16xf16, "AOp">
}
@@ -471,15 +570,20 @@ func.func @subgroup_mma_load_with_offset(%offset : index, %i0: index, %i1: index
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index)
// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]]
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref{%[[SIZE]]}
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[ZERO]]) : memref{%[[SIZE]]}
// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: %[[LD:.+]] = gpu.subgroup_mma_load_matrix %[[SUBSPAN]][%[[INDEX]]] {leadDimension = 32 : index}
// CHECK: return %[[LD]]

// -----

+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @subgroup_mma_store_with_offset(%offset : index, %i0: index, %i1: index, %val: !gpu.mma_matrix<16x16xf16, "COp">) {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<32x32xf16, strided<[32, 1], offset: ?>, 3>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<32x32xf16, strided<[32, 1], offset: ?>, 3>
  gpu.subgroup_mma_store_matrix %val, %subspan[%i0, %i1] {leadDimension = 128 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x32xf16, strided<[32, 1], offset: ?>, 3>
  return
}
@@ -490,14 +594,19 @@ func.func @subgroup_mma_store_with_offset(%offset : index, %i0: index, %i1: inde
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[VAL:.+]]: !gpu.mma_matrix<16x16xf16, "COp">
// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]]]
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%[[ZERO]]) : memref{%[[SIZE]]}
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[ZERO]]) : memref{%[[SIZE]]}
// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP2]]()[%[[OFFSET]], %[[I0]], %[[I1]]]
// CHECK: gpu.subgroup_mma_store_matrix %[[VAL]], %[[SUBSPAN]][%[[INDEX]]] {leadDimension = 128 : index}

// -----

+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @load_uniform_buffer(%offset: index, %i0: index, %i1 : index, %i2: index) -> i32 {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%offset) : memref<2x3x4xi32, strided<[12, 4, 1], offset:?>, #hal.descriptor_type>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<2x3x4xi32, strided<[12, 4, 1], offset:?>, #hal.descriptor_type>
  %val = memref.load %subspan[%i0, %i1, %i2] : memref<2x3x4xi32, strided<[12, 4, 1], offset:?>, #hal.descriptor_type>
  return %val: i32
}
@@ -506,7 +615,7 @@ func.func @load_uniform_buffer(%offset: index, %i0: index, %i1 : index, %i2: ind
// CHECK-LABEL: func.func @load_uniform_buffer
// CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%[[C0]]) : memref>
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref>
// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: %[[LD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]] : memref>
// CHECK: return %[[LD]] : i32
@@ -514,8 +623,13 @@ func.func @load_uniform_buffer(%offset: index, %i0: index, %i1 : index, %i2: ind

// -----

+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @store_uniform_buffer(%value : i32, %offset: index, %i0: index, %i1 : index, %i2: index) {
-  %subspan = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%offset) : memref<2x3x4xi32, strided<[12, 4, 1], offset:?>, #hal.descriptor_type>
+  %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<2x3x4xi32, strided<[12, 4, 1], offset:?>, #hal.descriptor_type>
  memref.store %value, %subspan[%i0, %i1, %i2] : memref<2x3x4xi32, strided<[12, 4, 1], offset:?>, #hal.descriptor_type>
  return
}
@@ -526,16 +640,21 @@ func.func @store_uniform_buffer(%value : i32, %offset: index, %i0: index, %i1 :
// CHECK-SAME: (%[[VAL:.+]]: i32, %[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index, %[[I2:.+]]: index)
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK: %[[SIZE:.+]] = affine.apply #[[$MAP0]]()[%[[OFFSET]]]
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) offset(%[[C0]]) : memref>{%[[SIZE]]}
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%[[C0]]) : memref>{%[[SIZE]]}
// CHECK: %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[OFFSET]], %[[I0]], %[[I1]], %[[I2]]]
// CHECK: memref.store %[[VAL]], %[[SUBSPAN]][%[[INDEX]]] : memref>

// -----

+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @reinterpret_cast_lowering_static_zero_offset() -> f32 {
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref{%0, %1}
  %3 = memref.reinterpret_cast %2 to offset: [0], sizes: [], strides: [] : memref to memref
  %4 = memref.load %3[] : memref
  return %4 : f32
@@ -545,11 +664,16 @@ func.func @reinterpret_cast_lowering_static_zero_offset() -> f32 {

// -----

+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
func.func @reinterpret_cast_lowering_dynamic_zero_offset() -> f32 {
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref{%0, %1}
  %3 = memref.reinterpret_cast %2 to offset: [%c0], sizes: [], strides: [] : memref to memref
  %4 = memref.load %3[] : memref
  return %4 : f32
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/fold_affine_min_of_block_id.mlir b/compiler/src/iree/compiler/Codegen/Common/test/fold_affine_min_of_block_id.mlir
index a4d0740e000fe..b2529d63a1620 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/fold_affine_min_of_block_id.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/fold_affine_min_of_block_id.mlir
@@ -1,8 +1,14 @@
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-fold-affinemin-in-distributed-loops, canonicalize)))))' %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
hal.executable public @generic_static {
  hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
-    hal.executable.export public @generic_static ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) {
+    hal.executable.export public @generic_static ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
      %c128 = arith.constant 128 : index
      %c1 = arith.constant 1 : index
@@ -17,8 +23,8 @@ hal.executable public @generic_static {
// CHECK:} -> tensor<32x32xf32>
// CHECK: flow.dispatch.tensor.store {{.*}} sizes = [32, 32], strides = [1, 1] : tensor<32x32xf32> -> !flow.dispatch.tensor>
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
      %workgroup_id_x = hal.interface.workgroup.id[0] : index
      %workgroup_id_y = hal.interface.workgroup.id[1] : index
      %2 = affine.min affine_map<()[s0] -> (32, s0 * -32 + 4096)>()[%workgroup_id_y]
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/hoist_unrolled_vector_extract_insert_slice.mlir b/compiler/src/iree/compiler/Codegen/Common/test/hoist_unrolled_vector_extract_insert_slice.mlir
index aa2ccccabbb6c..2e1ab038567b2 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/hoist_unrolled_vector_extract_insert_slice.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/hoist_unrolled_vector_extract_insert_slice.mlir
@@ -1,16 +1,23 @@
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-hoist-vector-extract-insert-slice))" %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
func.func @hoist_unrolled_vector_for_mma() {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f16
  %cst_0 = arith.constant dense<0.000000e+00> : vector<32x32xf32>
  %c64 = arith.constant 64 : index
  %c2048 = arith.constant 2048 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<3456x2048xf16>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<3456x2048xf16>
  memref.assume_alignment %0, 64 : memref<3456x2048xf16>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x1024xf16>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x1024xf16>
  memref.assume_alignment %1, 64 : memref<2048x1024xf16>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<3456x1024xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<3456x1024xf32>
  memref.assume_alignment %2, 64 : memref<3456x1024xf32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %3 = gpu.thread_id x
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
index f360aad2a3a42..c96a6dbe4e080 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/iree_comprehensive_bufferize.mlir
@@ -1,14 +1,22 @@
// RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-codegen-iree-comprehensive-bufferize, canonicalize, cse, canonicalize))" --split-input-file | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
func.func @matmul() {
  %c0 = arith.constant 0 : index
-  %m = hal.interface.constant.load[0] : index
-  %n = hal.interface.constant.load[1] : index
-  %k = hal.interface.constant.load[2] : index
-  %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%m, %k}
-  %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%k, %n}
-  %init = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%m, %n}
-  %result = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%m, %n}
+  %m = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %n = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %k = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %lhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%m, %k}
+  %rhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%k, %n}
+  %init = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%m, %n}
+  %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%m, %n}
  %wg_id_y = hal.interface.workgroup.id[1] : index
  %wg_count_y = hal.interface.workgroup.count[1] : index
  %wg_size_y = hal.interface.workgroup.size[1] : index
@@ -36,13 +44,13 @@ func.func @matmul() {
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0, s1] -> (s0 * s1)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
// CHECK: func.func @matmul()
-// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0]
-// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1]
-// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2]
-// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
-// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0)
+// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1)
+// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2)
+// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
+// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3)
// CHECK-DAG: %[[WG_ID_Y:.+]] = hal.interface.workgroup.id[1]
// CHECK-DAG: %[[WG_COUNT_Y:.+]] = hal.interface.workgroup.count[1]
// CHECK-DAG: %[[WG_SIZE_Y:.+]] = hal.interface.workgroup.size[1]
@@ -70,18 +78,25 @@ func.func @matmul() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
func.func @matmul_fill() {
  %cst = arith.constant 0.0 : f32
  %c0 = arith.constant 0 : index
  %c1024 = arith.constant 1024 : index
-  %m = hal.interface.constant.load[0] : index
-  %n = hal.interface.constant.load[1] : index
-  %k = hal.interface.constant.load[2] : index
-  %base_offset_i32 = hal.interface.constant.load[3] alignment(8) : i32
+  %m = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %n = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %k = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %base_offset_i32 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) alignment(8) : i32
  %base_offset = arith.index_castui %base_offset_i32 : i32 to index
-  %lhs = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor>{%m, %k}
-  %rhs = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%base_offset) : !flow.dispatch.tensor>{%k, %n}
-  %result = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c1024) : !flow.dispatch.tensor>{%m, %n}
+  %lhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor>{%m, %k}
+  %rhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%base_offset) : !flow.dispatch.tensor>{%k, %n}
+  %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c1024) : !flow.dispatch.tensor>{%m, %n}
  %wg_id_y = hal.interface.workgroup.id[1] : index
  %wg_count_y = hal.interface.workgroup.count[1] : index
  %wg_size_y = hal.interface.workgroup.size[1] : index
@@ -111,16 +126,16 @@ func.func @matmul_fill() {
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
// CHECK: func.func @matmul_fill()
// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f32
-// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0]
-// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1]
-// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2]
-// CHECK-DAG: %[[BASE_OFFSET_I32:.+]] = hal.interface.constant.load[3]
+// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0)
+// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1)
+// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2)
+// CHECK-DAG: %[[BASE_OFFSET_I32:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3)
// CHECK-DAG: %[[BASE_OFFSET:.+]] = arith.index_castui %[[BASE_OFFSET_I32]]
-// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32)
+// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(32)
// CHECK-DAG: memref.assume_alignment %[[LHS]], 32
-// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[BASE_OFFSET]])
+// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[BASE_OFFSET]])
// CHECK-DAG: memref.assume_alignment %[[RHS]], 8
-// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c1024)
+// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%c1024)
// CHECK-DAG: memref.assume_alignment %[[RESULT]], 64
// CHECK-DAG: %[[WG_ID_Y:.+]] = hal.interface.workgroup.id[1]
// CHECK-DAG: %[[WG_COUNT_Y:.+]] = hal.interface.workgroup.count[1]
@@ -149,6 +164,12 @@ func.func @matmul_fill() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @elementwise() {
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
@@ -156,8 +177,8 @@ func.func @elementwise() {
  %c512 = arith.constant 512 : index
  %c64 = arith.constant 64 : index
  %c10 = arith.constant 10 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c512) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c64) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c64) : !flow.dispatch.tensor>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
@@ -192,8 +213,8 @@ func.func @elementwise() {
// CHECK: func.func @elementwise()
// CHECK-DAG: %[[CST_TENSOR:.+]] = arith.constant dense_resource<__elided__> : tensor<1x10xf32>
// CHECK-DAG: %[[CST_BUF:.+]] = bufferization.to_memref %[[CST_TENSOR]]
-// CHECK-DAG: %[[IN_BUF:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.+}} : memref<1x10xf32, strided<[10, 1], offset: 128>, #hal.descriptor_type>
-// CHECK-DAG: %[[OUT_BUF:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.+}} : memref<1x10xf32, strided<[10, 1], offset: 16>, #hal.descriptor_type>
+// CHECK-DAG: %[[IN_BUF:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.+}} : memref<1x10xf32, strided<[10, 1], offset: 128>, #hal.descriptor_type>
+// CHECK-DAG: %[[OUT_BUF:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.+}} : memref<1x10xf32, strided<[10, 1], offset: 16>, #hal.descriptor_type>
// CHECK: scf.for
// CHECK-DAG: %[[SUB_IN1:.+]] = memref.subview %[[IN_BUF]]
// CHECK-DAG: %[[SUB_OUT1:.+]] = memref.subview %[[OUT_BUF]]
@@ -207,12 +228,18 @@ func.func @elementwise() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#map0 = affine_map<()[s0] -> (s0 * 2)>
#map1 = affine_map<(d0) -> (d0)>
func.func @rank_reduced_slice() {
  %c10 = arith.constant 10 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %3 = affine.apply #map0()[%workgroup_id_x]
@@ -230,8 +257,8 @@ func.func @rank_reduced_slice() {
  return
}
// CHECK: func.func @rank_reduced_slice()
-// CHECK-DAG: %[[SRC_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x40xf32, #hal.descriptor_type>
-// CHECK-DAG: %[[DST_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<10xf32, #hal.descriptor_type>
+// CHECK-DAG: %[[SRC_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<1x40xf32, #hal.descriptor_type>
+// CHECK-DAG: %[[DST_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<10xf32, #hal.descriptor_type>
// CHECK: scf.for %[[IV0:.+]] =
// CHECK-DAG: %[[SRC_SUBVIEW:.+]] = memref.subview %[[SRC_BINDING]][0, %[[IV0]]] [1, 2] [1, 1] : memref<1x40xf32{{.+}}> to memref<2xf32
// CHECK-DAG: %[[DST_SUBVIEW:.+]] = memref.subview %[[DST_BINDING]][%[[IV0]]] [2] [1] : memref<10xf32{{.+}}> to memref<2xf32
@@ -241,17 +268,24 @@ func.func @rank_reduced_slice() {

// -----

-// Check that there are no errors in early bufferized copy ops. The
+// Checks that there are no errors in early bufferized copy ops. The
// bufferization pass should make it as it is.
+
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @early_bufferized_copy_cst_ops() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %cst = arith.constant dense<0> : tensor<2x3xi32>
  %0 = bufferization.to_memref %cst : memref<2x3xi32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
-  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2x5xi32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<2x5xi32>
  memref.assume_alignment %1, 64 : memref<2x5xi32>
-  %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
  %3 = memref.subview %1[%c0, %c2] [2, 3] [%c1, %c1] : memref<2x5xi32> to memref<2x3xi32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>
  linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%0 : memref<2x3xi32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>) outs(%3 : memref<2x3xi32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>>) {
  ^bb0(%arg0: i32, %arg1: i32):
@@ -265,16 +299,23 @@ func.func @early_bufferized_copy_cst_ops() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
func.func @tile_from_tensor_load_inplace() {
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2}
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1}
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
@@ -290,9 +331,9 @@ func.func @tile_from_tensor_load_inplace() {
}

// CHECK-LABEL: func.func @tile_from_tensor_load_inplace()
-// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
@@ -304,17 +345,25 @@

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
func.func @tile_from_tensor_load_inplace_and_copy() {
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2}
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1}
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
-  %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1}
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
@@ -331,10 +380,10 @@ func.func @tile_from_tensor_load_inplace_and_copy() {
}

// CHECK-LABEL: func.func @tile_from_tensor_load_inplace_and_copy()
-// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
-// CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
+// CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3)
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
@@ -348,17 +397,24 @@ func.func @tile_from_tensor_load_inplace_and_copy() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @tile_from_pointwise_lhs_inplace() {
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2}
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1}
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
@@ -380,9 +436,9 @@ func.func @tile_from_pointwise_lhs_inplace() {
}

// CHECK-LABEL: func.func @tile_from_pointwise_lhs_inplace()
-// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
@@ -398,18 +454,26 @@ func.func @tile_from_pointwise_lhs_inplace() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @tile_from_pointwise_outs() {
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2}
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1}
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
-  %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1}
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
@@ -430,10 +494,10 @@ func.func @tile_from_pointwise_outs() {
  return
}
// CHECK-LABEL: func.func @tile_from_pointwise_outs()
-// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
-// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
+// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3)
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
@@ -449,18 +513,25 @@ func.func @tile_from_pointwise_outs() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @tile_from_pointwise_outs_inplace() {
  %cst = arith.constant 1.000000e+00 : f32
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2}
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1}
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
@@ -481,9 +552,9 @@ func.func @tile_from_pointwise_outs_inplace() {
}

// CHECK-LABEL: func.func @tile_from_pointwise_outs_inplace()
-// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
@@ -497,16 +568,23 @@ func.func @tile_from_pointwise_outs_inplace() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
func.func @tile_from_matmul_outs_inplace() {
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2}
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1}
-  %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1}
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %workgroup_id_y to %c2 step %c2 {
@@ -523,9 +601,9 @@ func.func @tile_from_matmul_outs_inplace() {
}

// CHECK-LABEL: func.func @tile_from_matmul_outs_inplace()
-// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
@@ -538,19 +616,26 @@ func.func @tile_from_matmul_outs_inplace() {

// -----

+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#map0 = affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)>
#map1 = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
func.func @bufferize_dynamic_inplace() {
  %c1 = arith.constant 1 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.constant.load[3] : index
-  %4 = hal.interface.constant.load[4] : index
-  %5 = hal.interface.constant.load[5] : index
-  %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
-  %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3}
-  %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%4, %5}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3}
+  %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%4, %5}
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
@@ -580,15 +665,15 @@ func.func @bufferize_dynamic_inplace() {
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
// CHECK: func.func @bufferize_dynamic_inplace()
-// CHECK: %[[DIM0:.+]] = hal.interface.constant.load[0] : index
-// CHECK: %[[DIM1:.+]] = hal.interface.constant.load[1] : index
-// CHECK: %[[DIM2:.+]] = hal.interface.constant.load[2] : index
-// CHECK: %[[DIM3:.+]] = hal.interface.constant.load[3] : index
-// CHECK: %[[DIM4:.+]] = hal.interface.constant.load[4] : index
-// CHECK: %[[DIM5:.+]] = hal.interface.constant.load[5] : index
-// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref>{%[[DIM0]], %[[DIM1]]}
-// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref>{%[[DIM2]], %[[DIM3]]}
-// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref>{%[[DIM4]], %[[DIM5]]}
+// CHECK: %[[DIM0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index
+// CHECK: %[[DIM1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index
+// CHECK: %[[DIM2:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index
+// CHECK: %[[DIM3:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3) : index
+// CHECK: %[[DIM4:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(4) : index
+// CHECK: %[[DIM5:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(5) : index
+// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref>{%[[DIM0]], %[[DIM1]]}
+// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref>{%[[DIM2]], %[[DIM3]]}
+// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref>{%[[DIM4]], %[[DIM5]]}
// CHECK-DAG: %[[WGSIZE_X:.+]] = hal.interface.workgroup.size[0]
// CHECK-DAG: %[[WGSIZE_Y:.+]] = hal.interface.workgroup.size[1]
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
@@ -606,29 +691,39 @@ func.func @bufferize_dynamic_inplace() {

// -----

-module {
-  func.func @reshape_simple() {
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32>
-    %3 = tensor.expand_shape %2 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32>
-    flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @reshape_simple() {
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32>
+  %3 = tensor.expand_shape %2 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32>
+  flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor>
+  return
}
// CHECK-LABEL: func.func @reshape_simple()
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]] // CHECK: linalg.generic {{.*}} ins(%[[RESHAPE]] {{.*}} outs(%[[RET0]] // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> module { func.func @reshape_fused_source() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32> %4 = tensor.expand_shape %3 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32> @@ -642,8 +737,8 @@ module { } } // CHECK-LABEL: func.func @reshape_fused_source() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12xi32, #hal.descriptor_type> -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xi32, #hal.descriptor_type> +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<12xi32, #hal.descriptor_type> +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<3x4xi32, #hal.descriptor_type> // CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]] // CHECK: linalg.generic // CHECK-SAME: ins(%[[RESHAPE]] : memref<3x4xi32 @@ -651,29 +746,34 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @reshape_fused_source_and_copyout() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32> - %5 = tensor.expand_shape %4 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32> - %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<3x4xi32>) outs(%2 : tensor<3x4xi32>) { - ^bb0(%arg0: i32, %arg1: i32): - %7 = arith.addi %arg0, %arg0 : i32 - linalg.yield %7 : i32 - } -> tensor<3x4xi32> - flow.dispatch.tensor.store %6, %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor> - flow.dispatch.tensor.store %5, %3, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor> - return - } +func.func @reshape_fused_source_and_copyout() { + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32> + %5 = tensor.expand_shape %4 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32> + %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<3x4xi32>) outs(%2 : tensor<3x4xi32>) { + ^bb0(%arg0: i32, %arg1: i32): + %7 = arith.addi %arg0, %arg0 : i32 + linalg.yield %7 : i32 + } -> tensor<3x4xi32> + flow.dispatch.tensor.store %6, %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor> + flow.dispatch.tensor.store %5, %3, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @reshape_fused_source_and_copyout() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12xi32, #hal.descriptor_type> -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xi32, #hal.descriptor_type> -// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<3x4xi32, #hal.descriptor_type> +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<12xi32, #hal.descriptor_type> +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<3x4xi32, #hal.descriptor_type> +// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<3x4xi32, #hal.descriptor_type> // CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]] // CHECK: linalg.generic // CHECK-SAME: ins(%[[RESHAPE]] : memref<3x4xi32 @@ -682,27 +782,31 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @reshape_fused_target() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32> - %3 = tensor.expand_shape %2 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> - %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<3x4xi32>) outs(%3 : tensor<3x4xi32>) { - ^bb0(%arg0: i32, %arg1: i32): - %7 = arith.addi %arg0, %arg0 : i32 - linalg.yield %7 : i32 - } -> tensor<3x4xi32> - %6 = tensor.collapse_shape %5 [[0, 1]] : tensor<3x4xi32> into tensor<12xi32> - flow.dispatch.tensor.store %6, %1, offsets = [0], sizes = [12], strides = [1] : tensor<12xi32> -> !flow.dispatch.tensor> - return - } +func.func @reshape_fused_target() { + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [12], strides = [1] : !flow.dispatch.tensor> -> tensor<12xi32> + %3 = tensor.expand_shape %2 [[0, 1]] output_shape [3, 4] : tensor<12xi32> into tensor<3x4xi32> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> + %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<3x4xi32>) outs(%3 : tensor<3x4xi32>) { + ^bb0(%arg0: i32, %arg1: i32): + %7 = arith.addi %arg0, %arg0 : i32 + linalg.yield %7 : i32 + } -> tensor<3x4xi32> + %6 = tensor.collapse_shape %5 [[0, 1]] : tensor<3x4xi32> into tensor<12xi32> + flow.dispatch.tensor.store %6, %1, offsets = [0], sizes = [12], strides = [1] : tensor<12xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @reshape_fused_target() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<3x4xi32, #hal.descriptor_type> -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<12xi32, #hal.descriptor_type> +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<3x4xi32, #hal.descriptor_type> +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<12xi32, #hal.descriptor_type> // CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[RET0]] {{\[}}[0, 1]] // CHECK: linalg.generic // CHECK-SAME: ins(%[[ARG0]] : memref<3x4xi32 @@ -710,48 +814,53 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map0 = affine_map<(d0)[s0] -> (-d0 + 1, s0)> #map1 = affine_map<(d0)[s0] -> (-d0 + 3, s0)> -module { - func.func @dot_general_lowering() { - %cst = arith.constant 0.000000e+00 : f32 - %c3 = arith.constant 3 : index - %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x2xf32> - %4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x1x2xf32> into tensor<1x2xf32> - %workgroup_size_x = hal.interface.workgroup.size[0] : index - %workgroup_size_y = hal.interface.workgroup.size[1] : index - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %5 = arith.muli %workgroup_size_y, %workgroup_id_y : index - %6 = arith.muli %workgroup_size_y, %workgroup_count_y : index - scf.for %arg0 = %5 to %c1 step %6 { - %7 = arith.muli %workgroup_size_x, %workgroup_id_x : index - %8 = arith.muli %workgroup_size_x, %workgroup_count_x : index - scf.for %arg1 = %7 to %c3 step %8 { - %9 = affine.min #map0(%arg0)[%workgroup_size_y] - %10 = affine.min #map1(%arg1)[%workgroup_size_x] - %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %12 = 
tensor.extract_slice %4[%arg0, 0] [%9, 2] [1, 1] : tensor<1x2xf32> to tensor - %13 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [2, %10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x?xf32> - %14 = linalg.fill ins(%cst : f32) outs(%11 : tensor) -> tensor - %15 = linalg.matmul ins(%12, %13 : tensor, tensor<2x?xf32>) outs(%14 : tensor) -> tensor - flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor> - } +func.func @dot_general_lowering() { + %cst = arith.constant 0.000000e+00 : f32 + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 1, 2], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x2xf32> + %4 = tensor.collapse_shape %3 [[0, 1], [2]] : tensor<1x1x2xf32> into tensor<1x2xf32> + %workgroup_size_x = hal.interface.workgroup.size[0] : index + %workgroup_size_y = hal.interface.workgroup.size[1] : index + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %5 = arith.muli %workgroup_size_y, %workgroup_id_y : index + %6 = arith.muli %workgroup_size_y, %workgroup_count_y : index + scf.for %arg0 = %5 to %c1 step %6 { + %7 = arith.muli %workgroup_size_x, %workgroup_id_x : index + %8 = arith.muli %workgroup_size_x, %workgroup_count_x : index + scf.for %arg1 = %7 to %c3 step %8 { + %9 = affine.min #map0(%arg0)[%workgroup_size_y] + %10 = affine.min #map1(%arg1)[%workgroup_size_x] + %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor + %12 = tensor.extract_slice %4[%arg0, 0] [%9, 2] [1, 1] : tensor<1x2xf32> to tensor + %13 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [2, %10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x?xf32> + %14 = linalg.fill ins(%cst : f32) outs(%11 : tensor) -> tensor + %15 = linalg.matmul ins(%12, %13 : tensor, tensor<2x?xf32>) outs(%14 : tensor) -> tensor + flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor> } - return } + return } // CHECK-LABEL: func.func @dot_general_lowering() -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[RESHAPE_LHS:.+]] = memref.collapse_shape %[[LHS]] -// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: scf.for %[[IV0:.+]] = {{.+}} { // CHECK: scf.for %[[IV1:.+]] = {{.+}} { // CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[RESHAPE_LHS]][%[[IV0]], 0] @@ -765,78 +874,91 @@ 
module { // ----- -module { - func.func @slice() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %7 = tensor.extract_slice %6[%0, %1] [%2, %3] [1, 1] : tensor to tensor - flow.dispatch.tensor.store %7, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @slice() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %7 = tensor.extract_slice %6[%0, %1] [%2, %3] [1, 1] : tensor to tensor + flow.dispatch.tensor.store %7, %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} + return } // CHECK-LABEL: func.func @slice() -// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[SUBVIEW:.+]] = memref.subview %[[ARG]] // CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW]] {{.*}} outs(%[[RETURN]] // ----- -module { - func.func @slice_rank_reducing() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.constant.load[4] : index - %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%4, %4, %4} - %6 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %7 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [%4, %4, %4], strides = [1, 1, 1] : !flow.dispatch.tensor>{%4, %4, %4} -> tensor - %8 = tensor.extract_slice %7[%0, %0, %1] [%2, 1, %3] [1, 1, 1] : tensor to tensor - flow.dispatch.tensor.store %8, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @slice_rank_reducing() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 
= hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%4, %4, %4} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %7 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [%4, %4, %4], strides = [1, 1, 1] : !flow.dispatch.tensor>{%4, %4, %4} -> tensor + %8 = tensor.extract_slice %7[%0, %0, %1] [%2, 1, %3] [1, 1, 1] : tensor to tensor + flow.dispatch.tensor.store %8, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} + return } // CHECK-LABEL: func.func @slice_rank_reducing() -// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[SUBVIEW:.+]] = memref.subview %[[ARG]] // CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW]] {{.*}} outs(%[[RETURN]] // ----- -module { - func.func @slice_multiple_copy() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.constant.load[4] : index - %5 = hal.interface.constant.load[5] : index - %6 = hal.interface.constant.load[6] : index - %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%6, %6, %6} - %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%3, %4, %5} - %9 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%3, %5} - %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [%6, %6, %6], strides = [1, 1, 1] : !flow.dispatch.tensor>{%6, %6, %6} -> tensor - %11 = tensor.extract_slice %10[%0, %1, %2] [%3, %4, %5] [1, 1, 1] : tensor to tensor - %12 = tensor.extract_slice %10[%0, %1, %2] [%3, 1, %5] [1, 1, 1] : tensor to tensor - flow.dispatch.tensor.store %11, %8, offsets = [0, 0, 0], sizes = [%3, %4, %5], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%3, %4, %5} - flow.dispatch.tensor.store %12, %9, offsets = [%0, %2], sizes = [%3, %5], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%3, %5} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @slice_multiple_copy() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : 
!flow.dispatch.tensor>{%6, %6, %6} + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%3, %4, %5} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%3, %5} + %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [%6, %6, %6], strides = [1, 1, 1] : !flow.dispatch.tensor>{%6, %6, %6} -> tensor + %11 = tensor.extract_slice %10[%0, %1, %2] [%3, %4, %5] [1, 1, 1] : tensor to tensor + %12 = tensor.extract_slice %10[%0, %1, %2] [%3, 1, %5] [1, 1, 1] : tensor to tensor + flow.dispatch.tensor.store %11, %8, offsets = [0, 0, 0], sizes = [%3, %4, %5], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%3, %4, %5} + flow.dispatch.tensor.store %12, %9, offsets = [%0, %2], sizes = [%3, %5], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%3, %5} + return } // CHECK-LABEL: func.func @slice_multiple_copy() -// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// CHECK-DAG: %[[SIZE1:.+]] = hal.interface.constant.load[3] : index -// CHECK-DAG: %[[SIZE2:.+]] = hal.interface.constant.load[4] : index -// CHECK-DAG: %[[SIZE3:.+]] = hal.interface.constant.load[5] : index +// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[SIZE1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3) : index +// CHECK-DAG: %[[SIZE2:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(4) : index +// CHECK-DAG: %[[SIZE3:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(5) : index // CHECK-DAG: %[[SUBVIEW1:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], %[[SIZE2]], %[[SIZE3]]] // CHECK-DAG: %[[SUBVIEW2:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], 1, %[[SIZE3]]] // CHECK: linalg.generic {{.*}} ins(%[[SUBVIEW1]] {{.*}} outs(%[[RETURN1]] @@ -844,72 +966,84 @@ module { // ----- -module { - func.func @slice_in_place() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %3 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - flow.dispatch.tensor.store %3, %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} - return - } +#pipeline_layout = #hal.pipeline.layout + ]> +]> +func.func @slice_in_place() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %3 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + flow.dispatch.tensor.store %3, %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} + return } // CHECK-LABEL: 
func.func @slice_in_place() // CHECK-NOT: linalg.generic // ----- -module { - func.func @slice_whole_stride_dispatch_0() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %7 = tensor.extract_slice %6[1, 0] [1, 4] [1, 1] : tensor to tensor<1x4xi32> - flow.dispatch.tensor.store %7, %5, offsets = [0, 0], sizes = [1, 4], strides = [1, 1] : tensor<1x4xi32> -> !flow.dispatch.tensor>{%2, %3} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @slice_whole_stride_dispatch_0() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %7 = tensor.extract_slice %6[1, 0] [1, 4] [1, 1] : tensor to tensor<1x4xi32> + flow.dispatch.tensor.store %7, %5, offsets = [0, 0], sizes = [1, 4], strides = [1, 1] : tensor<1x4xi32> -> !flow.dispatch.tensor>{%2, %3} + return } // CHECK-LABEL: func.func @slice_whole_stride_dispatch_0() -// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[SUB_IN_FIXED:.+]] = memref.subview %[[INPUT]][1, 0] [1, 4] [1, 1] // CHECK-DAG: %[[SUB_OUT_FIXED:.+]] = memref.subview %[[OUTPUT]][0, 0] [1, 4] [1, 1] // CHECK: linalg.generic {{.*}} ins(%[[SUB_IN_FIXED]] {{.*}} outs(%[[SUB_OUT_FIXED]] // ----- -module { - func.func @subtensor_insert() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.constant.load[4] : index - %5 = hal.interface.constant.load[5] : index - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%4, %5} - %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, 
%3} -> tensor - %11 = tensor.insert_slice %9 into %10[3, 4] [%0, %1] [1, 1] : tensor into tensor - flow.dispatch.tensor.store %11, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%4, %5} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @subtensor_insert() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%4, %5} + %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %11 = tensor.insert_slice %9 into %10[3, 4] [%0, %1] [1, 1] : tensor into tensor + flow.dispatch.tensor.store %11, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%4, %5} + return } // CHECK-LABEL: func.func @subtensor_insert() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] : index -// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] : index -// CHECK-DAG: %[[D2:.+]] = hal.interface.constant.load[2] : index -// CHECK-DAG: %[[D3:.+]] = hal.interface.constant.load[3] : index -// CHECK-DAG: %[[D4:.+]] = hal.interface.constant.load[4] : index -// CHECK-DAG: %[[D5:.+]] = hal.interface.constant.load[5] : index +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index +// CHECK-DAG: %[[D2:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index +// CHECK-DAG: %[[D3:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3) : index +// CHECK-DAG: %[[D4:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(4) : index +// CHECK-DAG: %[[D5:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(5) : index // CHECK-DAG: %[[ALLOC:.+]] = memref.alloc(%[[D2]], %[[D3]]) : memref // CHECK: linalg.generic {{.*}} ins(%[[ARG1]] {{.*}} outs(%[[ALLOC]] // CHECK: %[[SUB_ALLOC:.+]] = memref.subview %[[ALLOC]] @@ -918,21 +1052,25 @@ module { // ----- -module { - 
func.func @tensor_extract() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x9xi32> - %3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor - %4 = tensor.extract %3[] : tensor - %5 = linalg.fill ins(%4 : i32) outs(%2 : tensor<3x9xi32>) -> tensor<3x9xi32> - flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [3, 9], strides = [1, 1] : tensor<3x9xi32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @tensor_extract() { + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x9xi32> + %3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor + %4 = tensor.extract %3[] : tensor + %5 = linalg.fill ins(%4 : i32) outs(%2 : tensor<3x9xi32>) -> tensor<3x9xi32> + flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [3, 9], strides = [1, 1] : tensor<3x9xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @tensor_extract() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK: %[[LOAD:.+]] = memref.load %[[ARG0]] // CHECK: linalg.fill // CHECK-SAME: ins(%[[LOAD]] : @@ -940,64 +1078,72 @@ module { // ----- -module { - func.func @load_to_store() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> - flow.dispatch.tensor.store %2, %0, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @load_to_store() { + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<3x4xi32> + flow.dispatch.tensor.store %2, %0, offsets = [0, 0], sizes = [3, 4], strides = [1, 1] : tensor<3x4xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @load_to_store() -// CHECK: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<3x4xi32, #hal.descriptor_type> -// CHECK: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : 
memref<3x4xi32, #hal.descriptor_type> +// CHECK: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<3x4xi32, #hal.descriptor_type> +// CHECK: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<3x4xi32, #hal.descriptor_type> // CHECK: linalg.generic {{.*}} ins(%[[IN]] {{.*}} outs(%[[OUT]] // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map0 = affine_map<()[s0, s1] -> (s0 * s1)> #map1 = affine_map<(d0)[s0] -> (-d0 + 5, s0)> -module { - func.func @rhs_non_splat_constant() { - %cst = arith.constant dense<[[0.706495285, -0.567672312, 0.483717591, 0.522725761, 0.7563259], [-0.0899272263, -0.283501834, -0.350822538, -0.351515919, -0.337136656], [-0.451804549, 0.372324884, -0.620518147, 0.235451385, 0.851095855]]> : tensor<3x5xf32> - %cst_0 = arith.constant 0.000000e+00 : f32 - %c5 = arith.constant 5 : index - %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 5, 3, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x5x3x1xf32> - %3 = tensor.collapse_shape %2 [[0, 1], [2, 3]] : tensor<1x5x3x1xf32> into tensor<5x3xf32> - %workgroup_size_x = hal.interface.workgroup.size[0] : index - %workgroup_size_y = hal.interface.workgroup.size[1] : index - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %4 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] - %5 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] - scf.for %arg0 = %4 to %c5 step %5 { - %6 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] - %7 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] - scf.for %arg1 = %6 to %c5 step %7 { - %8 = affine.min #map1(%arg0)[%workgroup_size_y] - %9 = affine.min #map1(%arg1)[%workgroup_size_x] - %10 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %11 = tensor.extract_slice %3[%arg0, 0] [%8, 3] [1, 1] : tensor<5x3xf32> to tensor - %12 = tensor.extract_slice %cst[0, %arg1] [3, %9] [1, 1] : tensor<3x5xf32> to tensor<3x?xf32> - %13 = linalg.fill ins(%cst_0 : f32) outs(%10 : tensor) -> tensor - %14 = linalg.matmul ins(%11, %12 : tensor, tensor<3x?xf32>) outs(%13 : tensor) -> tensor - flow.dispatch.tensor.store %14, %1, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor> - } +func.func @rhs_non_splat_constant() { + %cst = arith.constant dense<[[0.706495285, -0.567672312, 0.483717591, 0.522725761, 0.7563259], [-0.0899272263, -0.283501834, -0.350822538, -0.351515919, -0.337136656], [-0.451804549, 0.372324884, -0.620518147, 0.235451385, 0.851095855]]> : tensor<3x5xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + %c5 = arith.constant 5 : index + %c1 = arith.constant 1 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 5, 3, 1], strides 
= [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x5x3x1xf32> + %3 = tensor.collapse_shape %2 [[0, 1], [2, 3]] : tensor<1x5x3x1xf32> into tensor<5x3xf32> + %workgroup_size_x = hal.interface.workgroup.size[0] : index + %workgroup_size_y = hal.interface.workgroup.size[1] : index + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %4 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y] + %5 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y] + scf.for %arg0 = %4 to %c5 step %5 { + %6 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x] + %7 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x] + scf.for %arg1 = %6 to %c5 step %7 { + %8 = affine.min #map1(%arg0)[%workgroup_size_y] + %9 = affine.min #map1(%arg1)[%workgroup_size_x] + %10 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor + %11 = tensor.extract_slice %3[%arg0, 0] [%8, 3] [1, 1] : tensor<5x3xf32> to tensor + %12 = tensor.extract_slice %cst[0, %arg1] [3, %9] [1, 1] : tensor<3x5xf32> to tensor<3x?xf32> + %13 = linalg.fill ins(%cst_0 : f32) outs(%10 : tensor) -> tensor + %14 = linalg.matmul ins(%11, %12 : tensor, tensor<3x?xf32>) outs(%13 : tensor) -> tensor + flow.dispatch.tensor.store %14, %1, offsets = [%arg0, %arg1], sizes = [%8, %9], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor> } - return } + return } // CHECK-LABEL: func.func @rhs_non_splat_constant // CHECK-DAG: %[[CONSTANT:.+]] = arith.constant {{.+}} : tensor<3x5xf32> // CHECK-DAG: %[[RHS:.+]] = bufferization.to_memref %[[CONSTANT]] -// CHECK-DAG: %[[LHS_INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x5x3x1xf32, #hal.descriptor_type> -// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5xf32, #hal.descriptor_type> +// CHECK-DAG: %[[LHS_INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<1x5x3x1xf32, #hal.descriptor_type> +// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<5x5xf32, #hal.descriptor_type> // CHECK: %[[LHS:.+]] = memref.collapse_shape %[[LHS_INPUT]] // CHECK: scf.for %[[IV0:.+]] = // CHECK: scf.for %[[IV1:.+]] = @@ -1012,63 +1158,73 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map0 = affine_map<(d0, d1) -> (d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @gather() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.constant.load[4] : index - %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %6 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2} - %7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%3, %4} - %8 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : !flow.dispatch.tensor>{%3, %4} -> tensor - %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 
1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor - %11 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor) outs(%8 : tensor) { - ^bb0(%arg0: i32, %arg1: f32): - %12 = linalg.index 1 : index - %13 = arith.index_cast %arg0 : i32 to index - %14 = tensor.extract %9[%13, %12] : tensor - linalg.yield %14 : f32 - } -> tensor - flow.dispatch.tensor.store %11, %7, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%3, %4} - return - } +func.func @gather() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2} + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%3, %4} + %8 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : !flow.dispatch.tensor>{%3, %4} -> tensor + %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor + %11 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor) outs(%8 : tensor) { + ^bb0(%arg0: i32, %arg1: f32): + %12 = linalg.index 1 : index + %13 = arith.index_cast %arg0 : i32 to index + %14 = tensor.extract %9[%13, %12] : tensor + linalg.yield %14 : f32 + } -> tensor + flow.dispatch.tensor.store %11, %7, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%3, %4} + return } // CHECK-LABEL: func.func @gather() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: linalg.generic // CHECK: %[[VAL:.+]] = memref.load %[[ARG0]] // CHECK: linalg.yield %[[VAL]] // ----- -module { - func.func @pooling_nhwc_sum() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x2x2x1xf32> - %4 = bufferization.alloc_tensor() : tensor<2x3xf32> - %5 = 
flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
-    %6 = tensor.extract %5[] : tensor<f32>
-    %7 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 4, 6, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x6x1xf32>> -> tensor<1x4x6x1xf32>
-    %8 = linalg.fill ins(%6 : f32) outs(%3 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
-    %9 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 3]> : vector<2xi64>} ins(%7, %4 : tensor<1x4x6x1xf32>, tensor<2x3xf32>) outs(%8 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
-    flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 1], strides = [1, 1, 1, 1] : tensor<1x2x2x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x2x1xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @pooling_nhwc_sum() {
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<f32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<1x4x6x1xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readwrite:tensor<1x2x2x1xf32>>
+  %3 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x2x2x1xf32>> -> tensor<1x2x2x1xf32>
+  %4 = bufferization.alloc_tensor() : tensor<2x3xf32>
+  %5 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+  %6 = tensor.extract %5[] : tensor<f32>
+  %7 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 4, 6, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x6x1xf32>> -> tensor<1x4x6x1xf32>
+  %8 = linalg.fill ins(%6 : f32) outs(%3 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+  %9 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 3]> : vector<2xi64>} ins(%7, %4 : tensor<1x4x6x1xf32>, tensor<2x3xf32>) outs(%8 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
+  flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 1], strides = [1, 1, 1, 1] : tensor<1x2x2x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x2x2x1xf32>>
+  return
 }
 // CHECK-LABEL: func.func @pooling_nhwc_sum
 // CHECK-DAG: %[[WINDOW:.+]] = memref.alloc() : memref<2x3xf32>
-// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<f32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1x4x6x1xf32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x2x2x1xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<f32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<1x4x6x1xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<1x2x2x1xf32, #hal.descriptor_type<storage_buffer>>
 // CHECK: %[[INIT_VAL:.+]] = memref.load %[[INIT]][] : memref<f32>
 // CHECK: linalg.fill
 // CHECK-SAME: ins(%[[INIT_VAL]] :
@@ -1081,58 +1237,63 @@ module {
 
 // -----
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 6, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map0 = affine_map<()[s0, s1] -> (s0 * s1)>
 #map1 = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
 #map2 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @read_only_subtensor() {
-    %c1 = arith.constant 1 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.constant.load[3] : index
-    %4 = hal.interface.constant.load[4] : index
-    %5 = hal.interface.constant.load[5] : index
-    %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
-    %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
-    %8 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
-    %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %5}
-    %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %5} -> tensor<?x?xf32>
-    %workgroup_size_x = hal.interface.workgroup.size[0] : index
-    %workgroup_size_y = hal.interface.workgroup.size[1] : index
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %11 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
-    %12 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
-    scf.for %arg0 = %11 to %2 step %12 {
-      %13 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
-      %14 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
-      scf.for %arg1 = %13 to %3 step %14 {
-        %15 = affine.min #map1(%arg0)[%workgroup_size_y, %2]
-        %16 = affine.min #map1(%arg1)[%workgroup_size_x, %3]
-        %17 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-        %18 = tensor.extract_slice %8[%arg0, %arg1] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-        %19 = tensor.extract_slice %10[%arg0, %arg1] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-        %20 = linalg.generic {indexing_maps = [#map2, #map2, #map2, #map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%18, %18, %19, %19 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) outs(%17 : tensor<?x?xf32>) {
-        ^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
-          %21 = arith.mulf %arg4, %arg5 : f32
-          %22 = arith.mulf %arg2, %arg3 : f32
-          %23 = arith.addf %22, %21 : f32
-          %24 = math.sqrt %23 : f32
-          linalg.yield %24 : f32
-        } -> tensor<?x?xf32>
-        flow.dispatch.tensor.store %20, %6, offsets = [%arg0, %arg1], sizes = [%15, %16], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
-      }
+func.func @read_only_subtensor() {
+  %c1 = arith.constant 1 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3}
+  %8 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %3} -> tensor<?x?xf32>
+  %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %5}
+  %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%4, %5} -> tensor<?x?xf32>
+  %workgroup_size_x = hal.interface.workgroup.size[0] : index
+  %workgroup_size_y = hal.interface.workgroup.size[1] : index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %11 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
+  %12 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
+  scf.for %arg0 = %11 to %2 step %12 {
+    %13 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
+    %14 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
+    scf.for %arg1 = %13 to %3 step %14 {
+      %15 = affine.min #map1(%arg0)[%workgroup_size_y, %2]
+      %16 = affine.min #map1(%arg1)[%workgroup_size_x, %3]
+      %17 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%15, %16], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+      %18 = tensor.extract_slice %8[%arg0, %arg1] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+      %19 = tensor.extract_slice %10[%arg0, %arg1] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+      %20 = linalg.generic {indexing_maps = [#map2, #map2, #map2, #map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%18, %18, %19, %19 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) outs(%17 : tensor<?x?xf32>) {
+      ^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32):
+        %21 = arith.mulf %arg4, %arg5 : f32
+        %22 = arith.mulf %arg2, %arg3 : f32
+        %23 = arith.addf %22, %21 : f32
+        %24 = math.sqrt %23 : f32
+        linalg.yield %24 : f32
+      } -> tensor<?x?xf32>
+      flow.dispatch.tensor.store %20, %6, offsets = [%arg0, %arg1], sizes = [%15, %16], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
     }
-    return
   }
+-    return
+  }
++  return
 }
 // CHECK-LABEL: func.func @read_only_subtensor
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
 // CHECK: scf.for
 // CHECK: scf.for
 // CHECK-DAG: %[[SV1:.+]] = memref.subview %[[ARG0]]
@@ -1144,31 +1305,35 @@ module {
 
 // -----
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0) -> (d0)>
-module {
-  func.func @reshape_read_only() {
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
-    %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
-    %5 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2} -> tensor<?xf32>
-    %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-    %7 = tensor.collapse_shape %6 [[0, 1]] : tensor<?x?xf32> into tensor<?xf32>
-    %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]}
-      ins(%7 : tensor<?xf32>)
-      outs(%5 : tensor<?xf32>) {
-    ^bb0(%arg0: f32, %arg1: f32):
-      %9 = arith.addf %arg0, %arg0 : f32
-      linalg.yield %9 : f32
-    } -> tensor<?xf32>
-    flow.dispatch.tensor.store %8, %4, offsets = [0], sizes = [%2], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
-    return
-  }
+func.func @reshape_read_only() {
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
+  %5 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2} -> tensor<?xf32>
+  %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+  %7 = tensor.collapse_shape %6 [[0, 1]] : tensor<?x?xf32> into tensor<?xf32>
+  %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]}
+    ins(%7 : tensor<?xf32>)
+    outs(%5 : tensor<?xf32>) {
+  ^bb0(%arg0: f32, %arg1: f32):
+    %9 = arith.addf %arg0, %arg0 : f32
+    linalg.yield %9 : f32
+  } -> tensor<?xf32>
+  flow.dispatch.tensor.store %8, %4, offsets = [0], sizes = [%2], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%2}
+  return
 }
 // CHECK-LABEL: func.func @reshape_read_only
-// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK: %[[RESHAPE:.+]] = memref.collapse_shape %[[INPUT]]
 // CHECK: linalg.generic
 // CHECK-SAME: ins(%[[RESHAPE]] : memref<?xf32
 
 // -----
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 #map0 = affine_map<(d0, d1, d2, d3) -> (d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @use_buffer_for_operand_when_output_tensor_not_used() {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32xf32>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
-    %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
-    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>> -> tensor<1x225x225x16xf32>
-    %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>> -> tensor<3x3x16x32xf32>
-    %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf32>> -> tensor<32xf32>
-    %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%5, %6 : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %10 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<32xf32>) outs(%9 : tensor<1x112x112x32xf32>) {
-    ^bb0(%arg0: f32, %arg1: f32):
-      %11 = arith.subf %arg1, %arg0 : f32
-      linalg.yield %11 : f32
-    } -> tensor<1x112x112x32xf32>
-    flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
-    return
-  }
+func.func @use_buffer_for_operand_when_output_tensor_not_used() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<32xf32>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
+  %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
+  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>> -> tensor<1x225x225x16xf32>
+  %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>> -> tensor<3x3x16x32xf32>
+  %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf32>> -> tensor<32xf32>
+  %8 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%5, %6 : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %10 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<32xf32>) outs(%9 : tensor<1x112x112x32xf32>) {
+  ^bb0(%arg0: f32, %arg1: f32):
+    %11 = arith.subf %arg1, %arg0 : f32
+    linalg.yield %11 : f32
+  } -> tensor<1x112x112x32xf32>
+  flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
+  return
 }
 // CHECK: func.func @use_buffer_for_operand_when_output_tensor_not_used()
 // CHECK-NOT: memref.alloc
-// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3)
 // CHECK: linalg.fill
 // CHECK-SAME: outs(%[[OUTPUT]] :
 // CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
@@ -1214,37 +1385,43 @@ module {
 
 // -----
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d3)>
-module {
-  func.func @dont_use_buffer_for_operand_when_output_tensor_used() {
-    %cst = arith.constant 1.000000e+00 : f32
-    %cst_0 = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32xf32>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
-    %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
-    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>> -> tensor<1x225x225x16xf32>
-    %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>> -> tensor<3x3x16x32xf32>
-    %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf32>> -> tensor<32xf32>
-    %8 = bufferization.alloc_tensor() : tensor<1x112x112x32xf32>
-    %9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %10 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%5, %6 : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>) outs(%9 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %11 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %12 = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %7 : tensor<1x112x112x32xf32>, tensor<32xf32>) outs(%11 : tensor<1x112x112x32xf32>) {
-    ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-      %13 = arith.subf %arg0, %arg1 : f32
-      %14 = arith.addf %13, %arg2 : f32
-      linalg.yield %14 : f32
-    } -> tensor<1x112x112x32xf32>
-    flow.dispatch.tensor.store %12, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
-    return
-  }
+func.func @dont_use_buffer_for_operand_when_output_tensor_used() {
+  %cst = arith.constant 1.000000e+00 : f32
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<32xf32>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
+  %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
+  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x16xf32>> -> tensor<1x225x225x16xf32>
+  %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x16x32xf32>> -> tensor<3x3x16x32xf32>
+  %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xf32>> -> tensor<32xf32>
+  %8 = bufferization.alloc_tensor() : tensor<1x112x112x32xf32>
+  %9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %10 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%5, %6 : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>) outs(%9 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %11 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %12 = linalg.generic {indexing_maps = [#map0, #map1, #map0], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %7 : tensor<1x112x112x32xf32>, tensor<32xf32>) outs(%11 : tensor<1x112x112x32xf32>) {
+  ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
+    %13 = arith.subf %arg0, %arg1 : f32
+    %14 = arith.addf %13, %arg2 : f32
+    linalg.yield %14 : f32
+  } -> tensor<1x112x112x32xf32>
+  flow.dispatch.tensor.store %12, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x112x112x32xf32>>
+  return
 }
 // CHECK-LABEL: func.func @dont_use_buffer_for_operand_when_output_tensor_used()
 // CHECK-DAG: %[[ALLOC:.+]] = memref.alloc
-// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3)
 // CHECK: linalg.fill
 // CHECK-SAME: outs(%[[ALLOC]] :
 // CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
@@ -1257,86 +1434,95 @@ module {
 
 // -----
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map0 = affine_map<(d0) -> (-d0 + 4)>
 #map1 = affine_map<(d0) -> (d0)>
 #map2 = affine_map<(d0) -> ()>
-module {
-  func.func @bufferize_cst_output_tensor() {
-    %c-2147483648_i32 = arith.constant -2147483648 : i32
-    %cst = arith.constant 0.000000e+00 : f32
-    %cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<5xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<i32>>
-    %2 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<i32>> -> tensor<i32>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5xf32>> -> tensor<5xf32>
-    %4 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor<i32>) -> tensor<i32>
-    %5 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["reduction"]} ins(%3, %cst_0 : tensor<5xf32>, tensor<5xi32>) outs(%4 : tensor<i32>) {
-    ^bb0(%arg0: f32, %arg1: i32, %arg2: i32):
-      %6 = arith.cmpf oeq, %arg0, %cst : f32
-      %7 = arith.extui %6 : i1 to i32
-      %8 = arith.muli %7, %arg1 : i32
-      %9 = arith.cmpi sgt, %8, %arg2 : i32
-      %10 = arith.select %9, %8, %arg2 : i32
-      linalg.yield %10 : i32
-    } -> tensor<i32>
-    flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<readwrite:tensor<i32>>
-    return
-  }
+func.func @bufferize_cst_output_tensor() {
+  %c-2147483648_i32 = arith.constant -2147483648 : i32
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<5xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<i32>>
+  %2 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<i32>> -> tensor<i32>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [5], strides = [1]
: !flow.dispatch.tensor> -> tensor<5xf32> + %4 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%2 : tensor) -> tensor + %5 = linalg.generic {indexing_maps = [#map0, #map1, #map2], iterator_types = ["reduction"]} ins(%3, %cst_0 : tensor<5xf32>, tensor<5xi32>) outs(%4 : tensor) { + ^bb0(%arg0: f32, %arg1: i32, %arg2: i32): + %6 = arith.cmpf oeq, %arg0, %cst : f32 + %7 = arith.extui %6 : i1 to i32 + %8 = arith.muli %7, %arg1 : i32 + %9 = arith.cmpi sgt, %8, %arg2 : i32 + %10 = arith.select %9, %8, %arg2 : i32 + linalg.yield %10 : i32 + } -> tensor + flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @bufferize_cst_output_tensor() // CHECK-DAG: %[[CST1:.+]] = arith.constant -2147483648 : i32 // CHECK-DAG: %[[CST5:.+]] = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32> // CHECK: %[[CAST5:.+]] = bufferization.to_memref %[[CST5]] : memref<5xi32> -// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<5xf32, #hal.descriptor_type> -// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref> +// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<5xf32, #hal.descriptor_type> +// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref> // CHECK: linalg.fill ins(%[[CST1]] : i32) outs(%[[OUTPUT]] : memref) // CHECK: linalg.generic // CHECK-SAME: ins(%[[INPUT]], %[[CAST5]] : {{.*}}) outs(%[[OUTPUT]] : memref) // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<()[s0] -> (s0 * 32)> -module { - func.func @cast_follwed_by_store() { - %cst = arith.constant 0.000000e+00 : f32 - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c1 = arith.constant 1 : index - %c32 = arith.constant 32 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z { - %3 = affine.apply #map()[%workgroup_id_y] - %4 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %3 to %c32 step %4 { - %5 = affine.apply #map()[%workgroup_id_x] - %6 = affine.apply #map()[%workgroup_count_x] - scf.for %arg2 = %5 to %c64 step %6 { - %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x32x32xf32> - %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, 32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x32x1024xf32> - %9 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 1024, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1024x32xf32> - %10 = linalg.fill ins(%cst : f32) outs(%7 : 
tensor<1x32x32xf32>) -> tensor<1x32x32xf32> - %11 = linalg.batch_matmul ins(%8, %9 : tensor<1x32x1024xf32>, tensor<1x1024x32xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32> - flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : tensor<1x32x32xf32> -> !flow.dispatch.tensor> - } +func.func @cast_follwed_by_store() { + %cst = arith.constant 0.000000e+00 : f32 + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z { + %3 = affine.apply #map()[%workgroup_id_y] + %4 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %3 to %c32 step %4 { + %5 = affine.apply #map()[%workgroup_id_x] + %6 = affine.apply #map()[%workgroup_count_x] + scf.for %arg2 = %5 to %c64 step %6 { + %7 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x32x32xf32> + %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, 32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x32x1024xf32> + %9 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 1024, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1024x32xf32> + %10 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32> + %11 = linalg.batch_matmul ins(%8, %9 : tensor<1x32x1024xf32>, tensor<1x1024x32xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32> + flow.dispatch.tensor.store %11, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, 32, 32], strides = [1, 1, 1] : tensor<1x32x32xf32> -> !flow.dispatch.tensor> } } - return } + return } // CHECK-LABEL: func.func @cast_follwed_by_store() // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x32x1024xf32, #hal.descriptor_type> -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4x1024x64xf32, #hal.descriptor_type> -// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x32x64xf32, #hal.descriptor_type> +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<4x32x1024xf32, #hal.descriptor_type> +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<4x1024x64xf32, #hal.descriptor_type> +// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<4x32x64xf32, #hal.descriptor_type> // CHECK-DAG: %[[LHSV:.+]] = memref.subview %[[LHS]] // CHECK-DAG: %[[RHSV:.+]] = memref.subview %[[RHS]] // CHECK-DAG: 
%[[RESULTV:.+]] = memref.subview %[[RESULT]] @@ -1347,30 +1533,41 @@ module { // ----- -module { - func.func @rank_reduced_subtensor_insert() { - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.constant.load[4] : index - %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %6 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3, %4} - %7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %8 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [%2, %3, %4], strides = [1, 1, 1] : !flow.dispatch.tensor>{%2, %3, %4} -> tensor - %9 = tensor.insert_slice %7 into %8[0, 0, 0] [1, %3, %4] [1, 1, 1] : tensor into tensor - flow.dispatch.tensor.store %9, %6, offsets = [0, 0, 0], sizes = [%2, %3, %4], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3, %4} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @rank_reduced_subtensor_insert() { + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3, %4} + %7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %8 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [%2, %3, %4], strides = [1, 1, 1] : !flow.dispatch.tensor>{%2, %3, %4} -> tensor + %9 = tensor.insert_slice %7 into %8[0, 0, 0] [1, %3, %4] [1, 1, 1] : tensor into tensor + flow.dispatch.tensor.store %9, %6, offsets = [0, 0, 0], sizes = [%2, %3, %4], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3, %4} + return } // CHECK-LABEL: func.func @rank_reduced_subtensor_insert() -// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[RET:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RET:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[RET]] // CHECK: linalg.generic {{.*}} ins(%[[ARG]] {{.*}} outs(%[[SUBVIEW]] // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1380,9 +1577,9 @@ func.func @bufferize_transfer_op_inplace() { %c0 = arith.constant 0 : index %c2 = arith.constant 2 : index %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = 
hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [2, 3], strides = [%c1, %c1] : !flow.dispatch.tensor> -> tensor<2x3xf32> %5 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [3, 1], strides = [%c1, %c1] : !flow.dispatch.tensor> -> tensor<3x1xf32> %6 = flow.dispatch.tensor.load %3, offsets = [%c0, %c0], sizes = [2, 1], strides = [%c1, %c1] : !flow.dispatch.tensor> -> tensor<2x1xf32> @@ -1410,9 +1607,9 @@ func.func @bufferize_transfer_op_inplace() { } // CHECK-LABEL: func.func @bufferize_transfer_op_inplace() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]] // CHECK-DAG: %[[RET0V:.+]] = memref.subview %[[RET0]] // CHECK-COUNT-6: vector.transfer_read %[[ARG0]] @@ -1424,61 +1621,67 @@ func.func @bufferize_transfer_op_inplace() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map0 = affine_map<(d0)[s0, s1] -> (-d0 + s0, s1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @multi_result() { - %c1 = arith.constant 1 : index - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.constant.load[4] : index - %5 = hal.interface.constant.load[5] : index - %6 = hal.interface.constant.load[6] : index - %7 = hal.interface.constant.load[7] : index - %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%4, %5} - %11 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%6, %7} - %12 = hal.interface.constant.load[8] : index - %13 = hal.interface.constant.load[9] : index - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_size_x = hal.interface.workgroup.size[0] : index - %workgroup_size_y = hal.interface.workgroup.size[1] : index - %14 = arith.muli %workgroup_id_y, %workgroup_size_y : 
index - %15 = arith.muli %workgroup_count_y, %workgroup_size_y : index - %16 = arith.muli %workgroup_id_x, %workgroup_size_x : index - %17 = arith.muli %workgroup_count_x, %workgroup_size_x : index - scf.for %arg0 = %14 to %12 step %15 { - scf.for %arg1 = %16 to %13 step %17 { - %18 = affine.min #map0(%arg0)[%12, %workgroup_size_y] - %19 = affine.min #map0(%arg1)[%13, %workgroup_size_x] - %20 = flow.dispatch.tensor.load %11, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%6, %7} -> tensor - %21 = flow.dispatch.tensor.load %10, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%4, %5} -> tensor - %22 = flow.dispatch.tensor.load %8, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %23 = flow.dispatch.tensor.load %9, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %24:2 = linalg.generic {indexing_maps = [#map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%22, %23 : tensor, tensor) outs(%21, %20 : tensor, tensor) { - ^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32): - %25 = arith.mulf %arg2, %arg3 : f32 - %26 = arith.addf %arg2, %arg3 : f32 - linalg.yield %25, %26 : f32, f32 - } -> (tensor, tensor) - flow.dispatch.tensor.store %24#0, %10, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor>{%4, %5} - flow.dispatch.tensor.store %24#1, %11, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor>{%6, %7} - } +func.func @multi_result() { + %c1 = arith.constant 1 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index + %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : index + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%4, %5} + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%6, %7} + %12 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : index + %13 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : index + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_size_x = hal.interface.workgroup.size[0] : index + %workgroup_size_y = hal.interface.workgroup.size[1] : index + %14 = arith.muli %workgroup_id_y, %workgroup_size_y : index + %15 = arith.muli %workgroup_count_y, %workgroup_size_y : index + %16 = arith.muli %workgroup_id_x, %workgroup_size_x : index + %17 = arith.muli %workgroup_count_x, 
%workgroup_size_x : index + scf.for %arg0 = %14 to %12 step %15 { + scf.for %arg1 = %16 to %13 step %17 { + %18 = affine.min #map0(%arg0)[%12, %workgroup_size_y] + %19 = affine.min #map0(%arg1)[%13, %workgroup_size_x] + %20 = flow.dispatch.tensor.load %11, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%6, %7} -> tensor + %21 = flow.dispatch.tensor.load %10, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%4, %5} -> tensor + %22 = flow.dispatch.tensor.load %8, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %23 = flow.dispatch.tensor.load %9, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %24:2 = linalg.generic {indexing_maps = [#map1, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%22, %23 : tensor, tensor) outs(%21, %20 : tensor, tensor) { + ^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32): + %25 = arith.mulf %arg2, %arg3 : f32 + %26 = arith.addf %arg2, %arg3 : f32 + linalg.yield %25, %26 : f32, f32 + } -> (tensor, tensor) + flow.dispatch.tensor.store %24#0, %10, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor>{%4, %5} + flow.dispatch.tensor.store %24#1, %11, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor -> !flow.dispatch.tensor>{%6, %7} } - return } + return } // CHECK-LABEL: func.func @multi_result() -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) // CHECK-DAG: %[[ARG0V:.+]] = memref.subview %[[ARG0]] // CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]] // CHECK-DAG: %[[RET0V:.+]] = memref.subview %[[RET0]] @@ -1489,55 +1692,61 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map0 = affine_map<()[s0] -> (s0 * 128)> #map1 = affine_map<(d0)[s0] -> (-d0 + s0, 128)> #map2 = affine_map<(d0, d1) -> (d1, d0)> #map3 = affine_map<(d0, d1) -> (d0)> -module { - func.func @multi_result_reduce() { - %c0_i32 = arith.constant 0 : i32 - %c-2147483648_i32 = arith.constant -2147483648 : i32 - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%2} - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : 
!flow.dispatch.tensor>{%2} - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %7 = affine.apply #map0()[%workgroup_id_x] - %8 = affine.apply #map0()[%workgroup_count_x] - scf.for %arg0 = %7 to %1 step %8 { - %9 = affine.min #map1(%arg0)[%1] - %10 = flow.dispatch.tensor.load %6, offsets = [%arg0], sizes = [%9], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor - %11 = flow.dispatch.tensor.load %5, offsets = [%arg0], sizes = [%9], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor - %12 = flow.dispatch.tensor.load %3, offsets = [0, %arg0], sizes = [%0, %9], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %13 = flow.dispatch.tensor.load %4, offsets = [0, %arg0], sizes = [%0, %9], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%11 : tensor) -> tensor - %15 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor) -> tensor - %16:2 = linalg.generic {indexing_maps = [#map2, #map2, #map3, #map3], iterator_types = ["parallel", "reduction"]} ins(%12, %13 : tensor, tensor) outs(%14, %15 : tensor, tensor) { - ^bb0(%arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32): - %17 = arith.cmpi sge, %arg1, %arg3 : i32 - %18 = arith.select %17, %arg1, %arg3 : i32 - %19 = arith.cmpi eq, %arg1, %arg3 : i32 - %20 = arith.cmpi slt, %arg2, %arg4 : i32 - %21 = arith.select %20, %arg2, %arg4 : i32 - %22 = arith.select %17, %arg2, %arg4 : i32 - %23 = arith.select %19, %21, %22 : i32 - linalg.yield %18, %23 : i32, i32 - } -> (tensor, tensor) - flow.dispatch.tensor.store %16#0, %5, offsets = [%arg0], sizes = [%9], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} - flow.dispatch.tensor.store %16#1, %6, offsets = [%arg0], sizes = [%9], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} - } - return +func.func @multi_result_reduce() { + %c0_i32 = arith.constant 0 : i32 + %c-2147483648_i32 = arith.constant -2147483648 : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%0, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%2} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%2} + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %7 = affine.apply #map0()[%workgroup_id_x] + %8 = affine.apply #map0()[%workgroup_count_x] + scf.for %arg0 = %7 to %1 step %8 { + %9 = affine.min #map1(%arg0)[%1] + %10 = flow.dispatch.tensor.load %6, offsets = [%arg0], sizes = [%9], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor + %11 = flow.dispatch.tensor.load %5, offsets = [%arg0], sizes = [%9], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor + %12 = flow.dispatch.tensor.load %3, offsets = [0, %arg0], sizes = [%0, %9], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %13 = flow.dispatch.tensor.load %4, offsets = [0, %arg0], sizes = [%0, %9], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %14 = linalg.fill ins(%c-2147483648_i32 : i32) outs(%11 : tensor) -> tensor + %15 = 
linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor) -> tensor + %16:2 = linalg.generic {indexing_maps = [#map2, #map2, #map3, #map3], iterator_types = ["parallel", "reduction"]} ins(%12, %13 : tensor, tensor) outs(%14, %15 : tensor, tensor) { + ^bb0(%arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32): + %17 = arith.cmpi sge, %arg1, %arg3 : i32 + %18 = arith.select %17, %arg1, %arg3 : i32 + %19 = arith.cmpi eq, %arg1, %arg3 : i32 + %20 = arith.cmpi slt, %arg2, %arg4 : i32 + %21 = arith.select %20, %arg2, %arg4 : i32 + %22 = arith.select %17, %arg2, %arg4 : i32 + %23 = arith.select %19, %21, %22 : i32 + linalg.yield %18, %23 : i32, i32 + } -> (tensor, tensor) + flow.dispatch.tensor.store %16#0, %5, offsets = [%arg0], sizes = [%9], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} + flow.dispatch.tensor.store %16#1, %6, offsets = [%arg0], sizes = [%9], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} } + return } // CHECK-LABEL: func.func @multi_result_reduce -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) // CHECK: scf.for // CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]] // CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]] @@ -1553,6 +1762,13 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map0 = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<(d0) -> (-d0 + 250, 64)> #map2 = affine_map<(d0) -> (-d0 + 370, 64)> @@ -1560,59 +1776,57 @@ module { #map4 = affine_map<(d0) -> (-d0 + 144, 24)> #map5 = affine_map<(d0) -> (-d0 + 370, 32)> #map6 = affine_map<(d0, d1) -> (32, d0 - d1)> -module { - func.func @l1_tiled_matmul_no_fill_readwrite() { - %c32 = arith.constant 32 : index - %c24 = arith.constant 24 : index - %c144 = arith.constant 144 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c250 = arith.constant 250 : index - %c370 = arith.constant 370 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %3 = affine.apply #map0()[%workgroup_id_y] - %4 = affine.apply #map0()[%workgroup_count_y] - scf.for %arg0 = %3 to %c250 step %4 { - %5 = affine.apply #map0()[%workgroup_id_x] - %6 = affine.apply #map0()[%workgroup_count_x] - scf.for %arg1 = %5 to %c370 step %6 { - %7 = affine.min #map1(%arg0) - %8 = flow.dispatch.tensor.load %0, offsets 
= [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %9 = affine.min #map2(%arg1) - %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<144x?xf32> - %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %12 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %11) -> (tensor) { - %13 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor) { - %14 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor) { - %15 = affine.min #map3(%arg2) - %16 = affine.min #map4(%arg6) - %17 = tensor.extract_slice %8[%arg2, %arg6] [%15, %16] [1, 1] : tensor to tensor - %18 = affine.min #map5(%arg4) - %19 = tensor.extract_slice %10[%arg6, %arg4] [%16, %18] [1, 1] : tensor<144x?xf32> to tensor - %20 = tensor.dim %arg7, %c0 : tensor - %21 = affine.min #map6(%20, %arg2) - %22 = tensor.dim %arg7, %c1 : tensor - %23 = affine.min #map6(%22, %arg4) - %24 = tensor.extract_slice %arg7[%arg2, %arg4] [%21, %23] [1, 1] : tensor to tensor - %25 = linalg.matmul ins(%17, %19 : tensor, tensor) outs(%24 : tensor) -> tensor - %26 = tensor.insert_slice %25 into %arg7[%arg2, %arg4] [%21, %23] [1, 1] : tensor into tensor - scf.yield %26 : tensor - } - scf.yield %14 : tensor +func.func @l1_tiled_matmul_no_fill_readwrite() { + %c32 = arith.constant 32 : index + %c24 = arith.constant 24 : index + %c144 = arith.constant 144 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c250 = arith.constant 250 : index + %c370 = arith.constant 370 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %3 = affine.apply #map0()[%workgroup_id_y] + %4 = affine.apply #map0()[%workgroup_count_y] + scf.for %arg0 = %3 to %c250 step %4 { + %5 = affine.apply #map0()[%workgroup_id_x] + %6 = affine.apply #map0()[%workgroup_count_x] + scf.for %arg1 = %5 to %c370 step %6 { + %7 = affine.min #map1(%arg0) + %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor + %9 = affine.min #map2(%arg1) + %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<144x?xf32> + %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : !flow.dispatch.tensor> -> tensor + %12 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %11) -> (tensor) { + %13 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor) { + %14 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor) { + %15 = affine.min #map3(%arg2) + %16 = affine.min #map4(%arg6) + %17 = tensor.extract_slice %8[%arg2, %arg6] [%15, %16] [1, 1] : tensor to tensor + %18 = affine.min #map5(%arg4) + %19 = tensor.extract_slice %10[%arg6, %arg4] [%16, %18] [1, 1] : tensor<144x?xf32> to tensor + %20 = tensor.dim %arg7, %c0 : tensor + 
%21 = affine.min #map6(%20, %arg2) + %22 = tensor.dim %arg7, %c1 : tensor + %23 = affine.min #map6(%22, %arg4) + %24 = tensor.extract_slice %arg7[%arg2, %arg4] [%21, %23] [1, 1] : tensor to tensor + %25 = linalg.matmul ins(%17, %19 : tensor, tensor) outs(%24 : tensor) -> tensor + %26 = tensor.insert_slice %25 into %arg7[%arg2, %arg4] [%21, %23] [1, 1] : tensor into tensor + scf.yield %26 : tensor } - scf.yield %13 : tensor + scf.yield %14 : tensor } - flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor -> !flow.dispatch.tensor> + scf.yield %13 : tensor } + flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor -> !flow.dispatch.tensor> } - return } + return } // CHECK-LABEL: l1_tiled_matmul_no_fill_readwrite // CHECK-DAG: %[[M:.+]] = arith.constant 250 : index @@ -1620,9 +1834,9 @@ module { // CHECK-DAG: %[[K:.+]] = arith.constant 144 : index // CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index // CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index -// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<250x144xf32, #hal.descriptor_type> -// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<144x370xf32, #hal.descriptor_type> -// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<250x370xf32, #hal.descriptor_type> +// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<250x144xf32, #hal.descriptor_type> +// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<144x370xf32, #hal.descriptor_type> +// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<250x370xf32, #hal.descriptor_type> // CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} { // CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} { // CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]]) @@ -1644,6 +1858,13 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map0 = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<(d0) -> (-d0 + 250, 64)> #map2 = affine_map<(d0) -> (-d0 + 370, 64)> @@ -1651,61 +1872,59 @@ module { #map4 = affine_map<(d0) -> (-d0 + 144, 24)> #map5 = affine_map<(d0) -> (-d0 + 370, 32)> #map6 = affine_map<(d0, d1) -> (32, d0 - d1)> -module { - func.func @l1_tiled_matmul() { - %cst = arith.constant 0.000000e+00 : f32 - %c32 = arith.constant 32 : index - %c24 = arith.constant 24 : index - %c144 = arith.constant 144 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c250 = arith.constant 250 : index - %c370 = arith.constant 370 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %3 = affine.apply #map0()[%workgroup_id_y] - %4 
= affine.apply #map0()[%workgroup_count_y]
-    scf.for %arg0 = %3 to %c250 step %4 {
-      %5 = affine.apply #map0()[%workgroup_id_x]
-      %6 = affine.apply #map0()[%workgroup_count_x]
-      scf.for %arg1 = %5 to %c370 step %6 {
-        %7 = affine.min #map1(%arg0)
-        %8 = affine.min #map2(%arg1)
-        %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%7, %8], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<250x370xf32>> -> tensor<?x?xf32>
-        %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x144xf32>> -> tensor<?x144xf32>
-        %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<144x370xf32>> -> tensor<144x?xf32>
-        %12 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
-        %13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %12) -> (tensor<?x?xf32>) {
-          %14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
-            %15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) {
-              %16 = affine.min #map3(%arg2)
-              %17 = affine.min #map4(%arg6)
-              %18 = tensor.extract_slice %10[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32>
-              %19 = affine.min #map5(%arg4)
-              %20 = tensor.extract_slice %11[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32>
-              %21 = tensor.dim %arg7, %c0 : tensor<?x?xf32>
-              %22 = affine.min #map6(%21, %arg2)
-              %23 = tensor.dim %arg7, %c1 : tensor<?x?xf32>
-              %24 = affine.min #map6(%23, %arg4)
-              %25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-              %26 = linalg.matmul ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32>
-              %27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-              scf.yield %27 : tensor<?x?xf32>
-            }
-            scf.yield %15 : tensor<?x?xf32>
+func.func @l1_tiled_matmul() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c32 = arith.constant 32 : index
+  %c24 = arith.constant 24 : index
+  %c144 = arith.constant 144 : index
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c250 = arith.constant 250 : index
+  %c370 = arith.constant 370 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<250x144xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<144x370xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readwrite:tensor<250x370xf32>>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %3 = affine.apply #map0()[%workgroup_id_y]
+  %4 = affine.apply #map0()[%workgroup_count_y]
+  scf.for %arg0 = %3 to %c250 step %4 {
+    %5 = affine.apply #map0()[%workgroup_id_x]
+    %6 = affine.apply #map0()[%workgroup_count_x]
+    scf.for %arg1 = %5 to %c370 step %6 {
+      %7 = affine.min #map1(%arg0)
+      %8 = affine.min #map2(%arg1)
+      %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%7, %8], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<250x370xf32>> -> tensor<?x?xf32>
+      %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x144xf32>> -> tensor<?x144xf32>
+      %11 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<144x370xf32>> -> tensor<144x?xf32>
+      %12 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+      %13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %12) -> (tensor<?x?xf32>) {
+        %14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
+          %15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) {
+            %16 = affine.min #map3(%arg2)
+            %17 = affine.min #map4(%arg6)
+            %18 = tensor.extract_slice %10[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32>
+            %19 = affine.min #map5(%arg4)
+            %20 = tensor.extract_slice %11[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32>
+            %21 = tensor.dim %arg7, %c0 : tensor<?x?xf32>
+            %22 = affine.min #map6(%21, %arg2)
+            %23 = tensor.dim %arg7, %c1 : tensor<?x?xf32>
+            %24 = affine.min #map6(%23, %arg4)
+            %25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+            %26 = linalg.matmul ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32>
+            %27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+            scf.yield %27 : tensor<?x?xf32>
          }
-          scf.yield %14 : tensor<?x?xf32>
+          scf.yield %15 : tensor<?x?xf32>
        }
-        flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x370xf32>>
+        scf.yield %14 : tensor<?x?xf32>
      }
+      flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %8], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<250x370xf32>>
    }
-    return
  }
+  return
}
// CHECK-LABEL: l1_tiled_matmul
// CHECK-DAG: %[[M:.+]] = arith.constant 250 : index
@@ -1713,9 +1932,9 @@ module {
// CHECK-DAG: %[[K:.+]] = arith.constant 144 : index
// CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index
// CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index
-// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<250x144xf32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<144x370xf32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<250x370xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<250x144xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<144x370xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<250x370xf32, #hal.descriptor_type<storage_buffer>>
// CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} {
// CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} {
// CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]])
@@ -1737,48 +1956,52 @@ module {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 6, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#map0 = affine_map<()[s0, s1] -> (s1 * s0)>
#map1 = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
#map2 = affine_map<(d0)[s0] -> (d0 + s0)>
-module {
-  func.func @tensor_insert_slice() {
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.constant.load[3] : index
-    %4 = hal.interface.constant.load[4] : index
-    %5 = hal.interface.constant.load[5] : index
-    %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%2, %3}
-    %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%4, %5}
-    %workgroup_size_x = hal.interface.workgroup.size[0] : index
-    %workgroup_size_y = hal.interface.workgroup.size[1] : index
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %8 = affine.apply #map0()[%workgroup_size_y, %workgroup_id_y]
-    %9 = affine.apply #map0()[%workgroup_size_y, %workgroup_count_y]
-    scf.for %arg0 = %8 to %2 step %9 {
-      %10 = affine.min #map1(%arg0)[%workgroup_size_y, %2]
-      %11 = affine.apply #map0()[%workgroup_size_x, %workgroup_id_x]
-      %12 = affine.apply #map0()[%workgroup_size_x, %workgroup_count_x]
-      scf.for %arg1 = %11 to %3 step %12 {
-        %13 = affine.min #map1(%arg1)[%workgroup_size_x, %3]
-        %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%2, %3} -> tensor<?x?xi32>
-        %15 = affine.apply #map2(%arg0)[%0]
-        %16 = affine.apply #map2(%arg1)[%1]
-        flow.dispatch.tensor.store %14, %7, offsets = [%15, %16], sizes = [%10, %13], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%4, %5}
-      }
+func.func @tensor_insert_slice() {
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%2, %3}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%4, %5}
+  %workgroup_size_x = hal.interface.workgroup.size[0] : index
+  %workgroup_size_y = hal.interface.workgroup.size[1] : index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %8 = affine.apply #map0()[%workgroup_size_y, %workgroup_id_y]
+  %9 = affine.apply #map0()[%workgroup_size_y, %workgroup_count_y]
+  scf.for %arg0 = %8 to %2 step %9 {
+    %10 = affine.min #map1(%arg0)[%workgroup_size_y, %2]
+    %11 = affine.apply #map0()[%workgroup_size_x, %workgroup_id_x]
+    %12 = affine.apply #map0()[%workgroup_size_x, %workgroup_count_x]
+    scf.for %arg1 = %11 to %3 step %12 {
+      %13 = affine.min #map1(%arg1)[%workgroup_size_x, %3]
+      %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%10, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xi32>>{%2, %3} -> tensor<?x?xi32>
+      %15 = affine.apply #map2(%arg0)[%0]
+      %16 = affine.apply #map2(%arg1)[%1]
+      flow.dispatch.tensor.store %14, %7, offsets = [%15, %16], sizes = [%10, %13], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%4, %5}
    }
-    return
  }
+  return
}
// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
// CHECK: func.func @tensor_insert_slice()
-// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OFFSET_Y:.+]] = hal.interface.constant.load[0]
-// CHECK-DAG: %[[OFFSET_X:.+]] = hal.interface.constant.load[1]
+// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?x?xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<?x?xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OFFSET_Y:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0)
+// CHECK-DAG: %[[OFFSET_X:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(1)
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK-DAG: %[[SRC_VIEW:.+]] = memref.subview %[[SRC]][%[[IV0]], %[[IV1]]]
@@ -1789,42 +2012,47 @@ module {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#map0 = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
#map2 = affine_map<(d0)[s0] -> (d0 + s0)>
-module {
-  func.func @dynamic_update_slice() {
-    %c0_i32 = arith.constant 0 : i32
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?xi32>>{%0}
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<i32>>
-    %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%1, %0}
-    %5 = flow.dispatch.tensor.load %3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i32>> -> tensor<i32>
-    %6 = tensor.extract %5[] : tensor<i32>
-    %7 = arith.cmpi slt, %6, %c0_i32 : i32
-    %8 = arith.select %7, %6, %c0_i32 : i32
-    %9 = arith.cmpi sgt, %8, %c0_i32 : i32
-    %10 = arith.select %9, %8, %c0_i32 : i32
-    %11 = arith.index_cast %10 : i32 to index
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %12 = affine.apply #map0()[%workgroup_id_x]
-    %13 = affine.apply #map0()[%workgroup_count_x]
-    scf.for %arg0 = %12 to %0 step %13 {
-      %14 = affine.min #map1(%arg0)[%0]
-      %15 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [%14], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xi32>>{%0} -> tensor<?xi32>
-      %16 = affine.apply #map2(%arg0)[%11]
-      flow.dispatch.tensor.store %15, %4, offsets = [0, %16], sizes = [1, %14], strides = [1, 1] : tensor<?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%1, %0}
-    }
-    return
+func.func @dynamic_update_slice() {
+  %c0_i32 = arith.constant 0 : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?xi32>>{%0}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<i32>>
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%1, %0}
+  %5 = flow.dispatch.tensor.load %3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i32>> -> tensor<i32>
+  %6 = tensor.extract %5[] : tensor<i32>
+  %7 = arith.cmpi slt, %6, %c0_i32 : i32
+  %8 = arith.select %7, %6, %c0_i32 : i32
+  %9 = arith.cmpi sgt, %8, %c0_i32 : i32
+  %10 = arith.select %9, %8, %c0_i32 : i32
+  %11 = arith.index_cast %10 : i32 to index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %12 = affine.apply #map0()[%workgroup_id_x]
+  %13 = affine.apply #map0()[%workgroup_count_x]
+  scf.for %arg0 = %12 to %0 step %13 {
+    %14 = affine.min #map1(%arg0)[%0]
+    %15 = flow.dispatch.tensor.load %2, offsets = [%arg0], sizes = [%14], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xi32>>{%0} -> tensor<?xi32>
+    %16 = affine.apply #map2(%arg0)[%11]
+    flow.dispatch.tensor.store %15, %4, offsets = [0, %16], sizes = [1, %14], strides = [1, 1] : tensor<?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xi32>>{%1, %0}
  }
+  return
}
// CHECK-LABEL: func.func @dynamic_update_slice()
-// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<?x?xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OFFSET_Y:.+]] = hal.interface.constant.load[0]
-// CHECK-DAG: %[[OFFSET_X:.+]] = hal.interface.constant.load[1]
+// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<?x?xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OFFSET_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0)
+// CHECK-DAG: %[[OFFSET_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1)
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: %[[SRC_VIEW:.+]] = memref.subview %[[SRC]][%[[IV0]]]
// CHECK-SAME: : memref to memref{{.+}}>
@@ -1834,88 +2062,94 @@ module {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
#map0 = affine_map<()[s0, s1] -> (s0 * s1)>
#map1 = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
#map2 = affine_map<(d0, d1) -> (4, d0 - d1)>
#map3 = affine_map<(d0, d1) -> (d0 + d1)>
#map4 = affine_map<(d0, d1) -> ()>
#map5 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @multi_level_tile_fuse() {
-    %c4 = arith.constant 4 : index
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
-    %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
-    %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<f32>>
-    %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
-    %7 = flow.dispatch.tensor.load %5, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_size_x = hal.interface.workgroup.size[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %workgroup_size_y = hal.interface.workgroup.size[1] : index
-    %8 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
-    %9 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
-    scf.for %arg0 = %8 to %0 step %9 {
-      %10 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
-      %11 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
-      scf.for %arg1 = %10 to %1 step %11 {
-        %12 = affine.min #map1(%arg0)[%workgroup_size_y, %0]
-        %13 = affine.min #map1(%arg1)[%workgroup_size_x, %1]
-        %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-        %15 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [%12, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
-        %16 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [%2, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
-        %17 = linalg.fill ins(%cst : f32) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
-        %18 = scf.for %arg2 = %c0 to %12 step %c4 iter_args(%arg3 = %17) -> (tensor<?x?xf32>) {
-          %20 = scf.for %arg4 = %c0 to %13 step %c4 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
-            %21 = affine.min #map2(%12, %arg2)
-            %22 = affine.min #map2(%13, %arg4)
-            %23 = tensor.extract_slice %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-            %24 = scf.for %arg6 = %c0 to %21 step %c4 iter_args(%arg7 = %23) -> (tensor<?x?xf32>) {
-              %26 = scf.for %arg8 = %c0 to %22 step %c4 iter_args(%arg9 = %arg7) -> (tensor<?x?xf32>) {
-                %27 = affine.min #map2(%21, %arg6)
-                %28 = affine.apply #map3(%arg6, %arg2)
-                %29 = tensor.extract_slice %15[%28, 0] [%27, %2] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-                %30 = affine.min #map2(%22, %arg8)
-                %31 = affine.apply #map3(%arg8, %arg4)
-                %32 = tensor.extract_slice %16[0, %31] [%2, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-                %33 = tensor.extract_slice %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-                %34 = linalg.matmul ins(%29, %32 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%33 : tensor<?x?xf32>) -> tensor<?x?xf32>
-                %35 = tensor.insert_slice %34 into %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-                scf.yield %35 : tensor<?x?xf32>
-              }
-              scf.yield %26 : tensor<?x?xf32>
+func.func @multi_level_tile_fuse() {
+  %c4 = arith.constant 4 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<f32>>
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+  %7 = flow.dispatch.tensor.load %5, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_size_x = hal.interface.workgroup.size[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %workgroup_size_y = hal.interface.workgroup.size[1] : index
+  %8 = affine.apply #map0()[%workgroup_id_y, %workgroup_size_y]
+  %9 = affine.apply #map0()[%workgroup_count_y, %workgroup_size_y]
+  scf.for %arg0 = %8 to %0 step %9 {
+    %10 = affine.apply #map0()[%workgroup_id_x, %workgroup_size_x]
+    %11 = affine.apply #map0()[%workgroup_count_x, %workgroup_size_x]
+    scf.for %arg1 = %10 to %1 step %11 {
+      %12 = affine.min #map1(%arg0)[%workgroup_size_y, %0]
+      %13 = affine.min #map1(%arg1)[%workgroup_size_x, %1]
+      %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+      %15 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [%12, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+      %16 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [%2, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+      %17 = linalg.fill ins(%cst : f32) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
+      %18 = scf.for %arg2 = %c0 to %12 step %c4 iter_args(%arg3 = %17) -> (tensor<?x?xf32>) {
+        %20 = scf.for %arg4 = %c0 to %13 step %c4 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
+          %21 = affine.min #map2(%12, %arg2)
+          %22 = affine.min #map2(%13, %arg4)
+          %23 = tensor.extract_slice %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+          %24 = scf.for %arg6 = %c0 to %21 step %c4 iter_args(%arg7 = %23) -> (tensor<?x?xf32>) {
+            %26 = scf.for %arg8 = %c0 to %22 step %c4 iter_args(%arg9 = %arg7) -> (tensor<?x?xf32>) {
+              %27 = affine.min #map2(%21, %arg6)
+              %28 = affine.apply #map3(%arg6, %arg2)
+              %29 = tensor.extract_slice %15[%28, 0] [%27, %2] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+              %30 = affine.min #map2(%22, %arg8)
+              %31 = affine.apply #map3(%arg8, %arg4)
+              %32 = tensor.extract_slice %16[0, %31] [%2, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+              %33 = tensor.extract_slice %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+              %34 = linalg.matmul ins(%29, %32 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%33 : tensor<?x?xf32>) -> tensor<?x?xf32>
+              %35 = tensor.insert_slice %34 into %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+              scf.yield %35 : tensor<?x?xf32>
            }
-            %25 = tensor.insert_slice %24 into %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-            scf.yield %25 : tensor<?x?xf32>
+            scf.yield %26 : tensor<?x?xf32>
          }
-          scf.yield %20 : tensor<?x?xf32>
+          %25 = tensor.insert_slice %24 into %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+          scf.yield %25 : tensor<?x?xf32>
        }
-        %19 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<f32>) outs(%18 : tensor<?x?xf32>) {
-        ^bb0(%arg2: f32, %arg3: f32):
-          %20 = arith.addf %arg2, %arg3 : f32
-          linalg.yield %20 : f32
-        } -> tensor<?x?xf32>
-        flow.dispatch.tensor.store %19, %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+        scf.yield %20 : tensor<?x?xf32>
      }
+      %19 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<f32>) outs(%18 : tensor<?x?xf32>) {
+      ^bb0(%arg2: f32, %arg3: f32):
+        %20 = arith.addf %arg2, %arg3 : f32
+        linalg.yield %20 : f32
+      } -> tensor<?x?xf32>
+      flow.dispatch.tensor.store %19, %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
    }
-    return
  }
+  return
}
// CHECK-LABEL: func.func @multi_level_tile_fuse()
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
-// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0]
-// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1]
-// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2]
-// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[K]]}
-// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[K]], %[[N]]}
-// CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<f32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[N]]}
+// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0)
+// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1)
+// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2)
+// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[K]]}
+// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[K]], %[[N]]}
+// CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<f32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[N]]}
// CHECK: scf.for
// CHECK: scf.for
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
@@ -1940,60 +2174,66 @@ module {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
#map0 = affine_map<()[s0] -> (s0 * 4)>
#map1 = affine_map<(d0) -> (-d0 + 2, 4)>
#map2 = affine_map<(d0) -> (-d0 + 1, 4)>
#map3 = affine_map<(d0, d1) -> ()>
#map4 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @operand_fusion() {
-    %c1 = arith.constant 1 : index
-    %c2 = arith.constant 2 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
-    %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
-    %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<f32>>
-    %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
-    %7 = flow.dispatch.tensor.load %5, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %8 = affine.apply #map0()[%workgroup_id_y]
-    %9 = affine.apply #map0()[%workgroup_count_y]
-    scf.for %arg0 = %8 to %c2 step %9 {
-      %10 = affine.apply #map0()[%workgroup_id_x]
-      %11 = affine.apply #map0()[%workgroup_count_x]
-      scf.for %arg1 = %10 to %c1 step %11 {
-        %12 = affine.min #map1(%arg0)
-        %13 = affine.min #map2(%arg1)
-        %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-        %15 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [%12, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x3xf32>
-        %16 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x?xf32>
-        %17 = linalg.fill ins(%cst : f32) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
-        %18 = linalg.matmul ins(%15, %16 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%17 : tensor<?x?xf32>) -> tensor<?x?xf32>
-        %19 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<f32>) outs(%18 : tensor<?x?xf32>) {
-        ^bb0(%arg2: f32, %arg3: f32):
-          %20 = arith.addf %arg2, %arg3 : f32
-          linalg.yield %20 : f32
-        } -> tensor<?x?xf32>
-        flow.dispatch.tensor.store %19, %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
-      }
+func.func @operand_fusion() {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<f32>>
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
+  %7 = flow.dispatch.tensor.load %5, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %8 = affine.apply #map0()[%workgroup_id_y]
+  %9 = affine.apply #map0()[%workgroup_count_y]
+  scf.for %arg0 = %8 to %c2 step %9 {
+    %10 = affine.apply #map0()[%workgroup_id_x]
+    %11 = affine.apply #map0()[%workgroup_count_x]
+    scf.for %arg1 = %10 to %c1 step %11 {
+      %12 = affine.min #map1(%arg0)
+      %13 = affine.min #map2(%arg1)
+      %14 = flow.dispatch.tensor.load %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+      %15 = flow.dispatch.tensor.load %3, offsets = [%arg0, 0], sizes = [%12, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x3xf32>
+      %16 = flow.dispatch.tensor.load %4, offsets = [0, %arg1], sizes = [3, %13], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<3x?xf32>
+      %17 = linalg.fill ins(%cst : f32) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
+      %18 = linalg.matmul ins(%15, %16 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%17 : tensor<?x?xf32>) -> tensor<?x?xf32>
+      %19 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<f32>) outs(%18 : tensor<?x?xf32>) {
+      ^bb0(%arg2: f32, %arg3: f32):
+        %20 = arith.addf %arg2, %arg3 : f32
+        linalg.yield %20 : f32
+      } -> tensor<?x?xf32>
+      flow.dispatch.tensor.store %19, %6, offsets = [%arg0, %arg1], sizes = [%12, %13], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%0, %1}
    }
-    return
  }
+  return
}
// CHECK-LABEL: func.func @operand_fusion()
-// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0]
-// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1]
-// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2]
-// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[K]]}
-// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[K]], %[[N]]}
-// CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<f32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[N]]}
+// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0)
+// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1)
+// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2)
+// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[K]]}
+// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[K]], %[[N]]}
+// CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<f32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>{%[[M]], %[[N]]}
// CHECK: scf.for
// CHECK: scf.for
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
@@ -2010,138 +2250,88 @@ module {

// -----

-// This test is a repro from a failure. No checking needed.
-
-#map0 = affine_map<()[s0] -> (s0 * 64)>
-#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
-#map2 = affine_map<(d0, d1) -> (d0)>
-#map3 = affine_map<(d0, d1) -> (d0, d1)>
-#map4 = affine_map<(d0, d1) -> (d0 + d1)>
-module {
-  func.func @forward_dispatch_3() {
-    %c384 = arith.constant 384 : index
-    %c512_i64 = arith.constant 512 : i64
-    %c0_i64 = arith.constant 0 : i64
-    %c0 = arith.constant 0 : index
-    %c592896 = arith.constant 592896 : index
-    %c47481856 = arith.constant 47481856 : index
-    %c64 = arith.constant 64 : index
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = arith.index_cast %0 : i32 to index
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c592896) : !flow.dispatch.tensor<readonly:tensor<1x512xi64>>
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c47481856) : !flow.dispatch.tensor<readonly:tensor<512x384xf32>>
-    %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?x384xf32>>{%1}
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x512xi64>> -> tensor<1x512xi64>
-    %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [512, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x384xf32>> -> tensor<512x384xf32>
-    %7 = tensor.extract_slice %5[0, 0] [1, %1] [1, 1] : tensor<1x512xi64> to tensor<?xi64>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %8 = affine.apply #map0()[%workgroup_id_y]
-    %9 = affine.apply #map0()[%workgroup_count_y]
-    scf.for %arg0 = %8 to %1 step %9 {
-      %10 = affine.apply #map0()[%workgroup_id_x]
-      %11 = affine.apply #map0()[%workgroup_count_x]
-      scf.for %arg1 = %10 to %c384 step %11 {
-        %12 = affine.min #map1(%arg0)[%1]
-        %13 = flow.dispatch.tensor.load %4, offsets = [%arg0, %arg1], sizes = [%12, 64], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x384xf32>>{%1} -> tensor<?x64xf32>
-        %14 = tensor.extract_slice %7[%arg0] [%12] [1] : tensor<?xi64> to tensor<?xi64>
-        %15 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%14 : tensor<?xi64>) outs(%13 : tensor<?x64xf32>) {
-        ^bb0(%arg2: i64, %arg3: f32):
-          %16 = arith.index_cast %arg2 : i64 to index
-          %17 = linalg.index 1 : index
-          %18 = affine.apply #map4(%17, %arg1)
-          %19 = arith.cmpi slt, %arg2, %c512_i64 : i64
-          cf.assert %19, "index must be smaller than dim size"
-          %20 = arith.cmpi sge, %arg2, %c0_i64 : i64
-          cf.assert %20, "index must be larger or equal to 0"
-          %21 = tensor.extract %6[%16, %18] : tensor<512x384xf32>
-          linalg.yield %21 : f32
-        } -> tensor<?x64xf32>
-        flow.dispatch.tensor.store %15, %4, offsets = [%arg0, %arg1], sizes = [%12, 64], strides = [1, 1] : tensor<?x64xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x384xf32>>{%1}
-      }
-    }
-    return
-  }
-}
-// CHECK: func.func @forward_dispatch_3()
-
-// -----
-
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#map0 = affine_map<()[s0] -> (s0 * 4)>
#map1 = affine_map<()[s0] -> (s0 * 2)>
#map2 = affine_map<(d0) -> (-d0 + 6, 4)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-module {
-  func.func @dot_general_nontrivial_batching_mutliple_parallel_dimension() {
-    %cst = arith.constant dense<0.000000e+00> : vector<1x4x2xf32>
-    %c1 = arith.constant 1 : index
-    %c6 = arith.constant 6 : index
-    %c2 = arith.constant 2 : index
-    %cst_0 = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %c64 = arith.constant 64 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x6x1xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c64) : !flow.dispatch.tensor<readonly:tensor<2x1x2xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x6x2xf32>>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %workgroup_id_z = hal.interface.workgroup.id[2] : index
-    %workgroup_count_z = hal.interface.workgroup.count[2] : index
-    %3 = affine.apply #map0()[%workgroup_id_y]
-    %4 = affine.apply #map0()[%workgroup_count_y]
-    %5 = affine.apply #map1()[%workgroup_id_x]
-    %6 = affine.apply #map1()[%workgroup_count_x]
-    scf.for %arg0 = %workgroup_id_z to %c2 step %workgroup_count_z {
-      scf.for %arg1 = %3 to %c6 step %4 {
-        %7 = affine.min #map2(%arg1)
-        %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, %7, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x6x1xf32>> -> tensor<1x?x1xf32>
-        %9 = tensor.extract_slice %8[0, 0, 0] [1, %7, 1] [1, 1, 1] : tensor<1x?x1xf32> to tensor<1x?x1xf32>
-        %10 = vector.transfer_read %9[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x1xf32>, vector<1x4x1xf32>
-        scf.for %arg2 = %5 to %c2 step %6 {
-          %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, %7, 2], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x6x2xf32>> -> tensor<1x?x2xf32>
-          %12 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 1, 2], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1x2xf32>> -> tensor<1x1x2xf32>
-          %13 = tensor.extract_slice %11[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> to tensor<1x?x2xf32>
-          %14 = vector.transfer_write %cst, %13[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x4x2xf32>, tensor<1x?x2xf32>
-          %15 = tensor.extract_slice %14[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> to tensor<1x?x2xf32>
-          %16 = vector.transfer_read %12[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x2xf32>, vector<1x1x2xf32>
-          %17 = vector.transfer_read %15[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x2xf32>, vector<1x4x2xf32>
-          %18 = vector.contract {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %10, %16, %17 : vector<1x4x1xf32>, vector<1x1x2xf32> into vector<1x4x2xf32>
-          %19 = vector.transfer_write %18, %15[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x4x2xf32>, tensor<1x?x2xf32>
-          %20 = tensor.insert_slice %19 into %14[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> into tensor<1x?x2xf32>
-          %21 = tensor.insert_slice %20 into %11[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> into tensor<1x?x2xf32>
-          flow.dispatch.tensor.store %21, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, %7, 2], strides = [1, 1, 1] : tensor<1x?x2xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x6x2xf32>>
-        }
+func.func @dot_general_nontrivial_batching_mutliple_parallel_dimension() {
+  %cst = arith.constant dense<0.000000e+00> : vector<1x4x2xf32>
+  %c1 = arith.constant 1 : index
+  %c6 = arith.constant 6 : index
+  %c2 = arith.constant 2 : index
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c64 = arith.constant 64 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x6x1xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c64) : !flow.dispatch.tensor<readonly:tensor<2x1x2xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x6x2xf32>>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %workgroup_id_z = hal.interface.workgroup.id[2] : index
+  %workgroup_count_z = hal.interface.workgroup.count[2] : index
+  %3 = affine.apply #map0()[%workgroup_id_y]
+  %4 = affine.apply #map0()[%workgroup_count_y]
+  %5 = affine.apply #map1()[%workgroup_id_x]
+  %6 = affine.apply #map1()[%workgroup_count_x]
+  scf.for %arg0 = %workgroup_id_z to %c2 step %workgroup_count_z {
+    scf.for %arg1 = %3 to %c6 step %4 {
+      %7 = affine.min #map2(%arg1)
+      %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [1, %7, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x6x1xf32>> -> tensor<1x?x1xf32>
+      %9 = tensor.extract_slice %8[0, 0, 0] [1, %7, 1] [1, 1, 1] : tensor<1x?x1xf32> to tensor<1x?x1xf32>
+      %10 = vector.transfer_read %9[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x1xf32>, vector<1x4x1xf32>
+      scf.for %arg2 = %5 to %c2 step %6 {
+        %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, %7, 2], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x6x2xf32>> -> tensor<1x?x2xf32>
+        %12 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [1, 1, 2], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1x2xf32>> -> tensor<1x1x2xf32>
+        %13 = tensor.extract_slice %11[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> to tensor<1x?x2xf32>
+        %14 = vector.transfer_write %cst, %13[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x4x2xf32>, tensor<1x?x2xf32>
+        %15 = tensor.extract_slice %14[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> to tensor<1x?x2xf32>
+        %16 = vector.transfer_read %12[%c0, %c0, %c0], %cst_0 {in_bounds = [true, true, true]} : tensor<1x1x2xf32>, vector<1x1x2xf32>
+        %17 = vector.transfer_read %15[%c0, %c0, %c0], %cst_0 {in_bounds = [true, false, true]} : tensor<1x?x2xf32>, vector<1x4x2xf32>
+        %18 = vector.contract {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %10, %16, %17 : vector<1x4x1xf32>, vector<1x1x2xf32> into vector<1x4x2xf32>
+        %19 = vector.transfer_write %18, %15[%c0, %c0, %c0] {in_bounds = [true, false, true]} : vector<1x4x2xf32>, tensor<1x?x2xf32>
+        %20 = tensor.insert_slice %19 into %14[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> into tensor<1x?x2xf32>
+        %21 = tensor.insert_slice %20 into %11[0, 0, 0] [1, %7, 2] [1, 1, 1] : tensor<1x?x2xf32> into tensor<1x?x2xf32>
+        flow.dispatch.tensor.store %21, %2, offsets = [%arg0, %arg1, %arg2], sizes = [1, %7, 2], strides = [1, 1, 1] : tensor<1x?x2xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x6x2xf32>>
      }
    }
-    return
  }
+  return
}
// CHECK-LABEL: func.func @dot_general_nontrivial_batching_mutliple_parallel_dimension()
// CHECK-NOT: memref.alloc

// -----

-module {
-  func.func @no_op_subview() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    %4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-    %5 = tensor.extract_slice %4[0, 0] [%0, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-    flow.dispatch.tensor.store %5, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @no_op_subview() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  %4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+  %5 = tensor.extract_slice %4[0, 0] [%0, %1] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  flow.dispatch.tensor.store %5, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  return
}
// CHECK-LABEL: func.func @no_op_subview()
-// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
// CHECK-DAG: %[[SRC_DUP:.+]] = memref.subview %[[SRC]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[SRC_DUP]] :
@@ -2149,11 +2339,17 @@ module {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @rank_reducing_no_op_subview() {
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0}
-  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%0}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%0}
  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0} -> tensor<1x?xf32>
  %4 = tensor.extract_slice %3[0, 0] [1, %0] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
  flow.dispatch.tensor.store %4, %2, offsets = [0], sizes = [%0], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?xf32>>{%0}
@@ -2161,8 +2357,8 @@ func.func @rank_reducing_no_op_subview() {
}
// CHECK-LABEL: func.func @rank_reducing_no_op_subview()
-// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[DEST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][0, 0] [1, %{{.+}}]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[SUBVIEW]] :
@@ -2186,11 +2382,18 @@ func.func @fft_tensor(%idx: index) -> (tensor<1024xf32>, tensor<1024xf32>) {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
func.func @scan_1d_dim0_inclusive_sum() {
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<6xf32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<f32>>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<6xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<6xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<f32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<6xf32>>
  %3 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<6xf32>> -> tensor<6xf32>
  %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [6], strides = [1] : !flow.dispatch.tensor<readonly:tensor<6xf32>> -> tensor<6xf32>
  %5 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
@@ -2211,9 +2414,14 @@

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
func.func @sort1D() {
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4xi32>>
  %1 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<4xi32>> -> tensor<4xi32>
  %2 = iree_linalg_ext.sort dimension(0) outs(%1 : tensor<4xi32>) {
  ^bb0(%arg0: i32, %arg1: i32):
@@ -2224,18 +2432,25 @@
  return
}
// CHECK-LABEL: func.func @sort1D
-// CHECK: %[[BUF:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK: %[[BUF:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) : memref<4xi32, #hal.descriptor_type<storage_buffer>>
// CHECK: iree_linalg_ext.sort
// CHECK-SAME: outs(%[[BUF]] : memref<4xi32{{.+}}>)

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
func.func @scatter_update_scalar_1D() {
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x1xi32>>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<8xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x1xi32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<8xi32>>
  %3 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<8xi32>> -> tensor<8xi32>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
@@ -2253,9 +2468,9 @@ func.func @scatter_update_scalar_1D() {
  return
}
// CHECK: func.func @scatter_update_scalar_1D
-// CHECK-DAG: %[[UPDATE:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[INDICES:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<4x1xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[ORIGINAL:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<8xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[UPDATE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) : memref<4xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[INDICES:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) : memref<4x1xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[ORIGINAL:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%c0) : memref<8xi32, #hal.descriptor_type<storage_buffer>>
// CHECK: scf.for %[[I:.+]] = %{{.+}} to %{{.+}} step %{{.+}}
// CHECK: iree_linalg_ext.scatter
// CHECK-SAME: ins(%[[UPDATE]], %[[INDICES]]
@@ -2263,11 +2478,17 @@ func.func @scatter_update_scalar_1D() {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @topk() {
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<200x8xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<200x8xf32>>
  %input_values = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [200, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<200x8xf32>> -> tensor<200x8xf32>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<200x8xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<200x8xi32>>
  %input_indices = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [200, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<200x8xi32>> -> tensor<200x8xi32>
  %out_values = bufferization.alloc_tensor() : tensor<200x3xf32>
  %out_indices = bufferization.alloc_tensor() : tensor<200x3xi32>
@@ -2282,8 +2503,8 @@ func.func @topk() {
  return
}
// CHECK: func.func @topk
-// CHECK-DAG: %[[INPUT_VALUES:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<200x8xf32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[INPUT_INDICES:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<200x8xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[INPUT_VALUES:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<200x8xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[INPUT_INDICES:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<200x8xi32, #hal.descriptor_type<storage_buffer>>
// CHECK-DAG: %[[OUTPUT_VALUES:.+]] = memref.alloc() : memref<200x3xf32>
// CHECK-DAG: %[[OUTPUT_INDICES:.+]] = memref.alloc() : memref<200x3xi32>
// CHECK: iree_linalg_ext.topk
@@ -2292,11 +2513,17 @@

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @iree_linalg_ext_pack() {
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x4xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x2x3x3xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x4xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x2x3x3xi32>>
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [2, 2, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2x3x3xi32>> -> tensor<2x2x3x3xi32>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x4xi32>> -> tensor<4x4xi32>
  %4 = iree_linalg_ext.pack %3 padding_value(%c0_i32 : i32) inner_dims_pos = [0, 1] inner_tiles = [3, 3] into %2 : (tensor<4x4xi32> tensor<2x2x3x3xi32>) -> tensor<2x2x3x3xi32>
@@ -2305,18 +2532,24 @@
}
// CHECK: func.func @iree_linalg_ext_pack
// CHECK-DAG: %[[PAD:.+]] = arith.constant 0 : i32
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x2x3x3xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) : memref<2x2x3x3xi32, #hal.descriptor_type<storage_buffer>>
// CHECK: iree_linalg_ext.pack %[[IN]]
// CHECK-SAME: padding_value(%[[PAD]] : i32)
// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 3] into %[[OUT]]

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @iree_linalg_ext_unpack() {
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>> -> tensor<4x4xi32>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 2, 2, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>> -> tensor<2x2x2x2xi32>
  %4 = iree_linalg_ext.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %2 : (tensor<2x2x2x2xi32> tensor<4x4xi32>) -> tensor<4x4xi32>
@@ -2324,18 +2557,24 @@ func.func @iree_linalg_ext_unpack() {
  return
}
// CHECK: func.func @iree_linalg_ext_unpack
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2x2x2x2xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) : memref<2x2x2x2xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
// CHECK: iree_linalg_ext.unpack %[[IN]]
// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %[[OUT]]

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @iree_linalg_ext_unpack_fully_dynamic() {
  %c0 = arith.constant 0 : index
  %inner_d0 = util.unfoldable_constant 2 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>> -> tensor<4x4xi32>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 2, %inner_d0, %inner_d0], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>> -> tensor<2x2x?x?xi32>
  %4 = iree_linalg_ext.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [%inner_d0, %inner_d0] into %2 : (tensor<2x2x?x?xi32> tensor<4x4xi32>) -> tensor<4x4xi32>
@@ -2350,11 +2589,17 @@ func.func @iree_linalg_ext_unpack_fully_dynamic() {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @tensor_pack() {
  %c0 = arith.constant 0 : index
  %c0_i32 = arith.constant 0 : i32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x4xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x2x3x3xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x4xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x2x3x3xi32>>
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [2, 2, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x2x3x3xi32>> -> tensor<2x2x3x3xi32>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x4xi32>> -> tensor<4x4xi32>
  %4 = tensor.pack %3 padding_value(%c0_i32 : i32) inner_dims_pos = [0, 1] inner_tiles = [3, 3] into %2 : tensor<4x4xi32> -> tensor<2x2x3x3xi32>
@@ -2363,18 +2608,24 @@ func.func @tensor_pack() {
}
// CHECK: func.func @tensor_pack
// CHECK-DAG: %[[PAD:.+]] = arith.constant 0 : i32
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2x2x3x3xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) : memref<2x2x3x3xi32, #hal.descriptor_type<storage_buffer>>
// CHECK: iree_linalg_ext.pack %[[IN]]
// CHECK-SAME: padding_value(%[[PAD]] : i32)
// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 3] into %[[OUT]]

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @tensor_unpack() {
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>> -> tensor<4x4xi32>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 2, 2, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>> -> tensor<2x2x2x2xi32>
  %4 = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %2 : tensor<2x2x2x2xi32> -> tensor<4x4xi32>
@@ -2382,18 +2633,24 @@ func.func @tensor_unpack() {
  return
}
// CHECK: func.func @tensor_unpack
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2x2x2x2xi32, #hal.descriptor_type<storage_buffer>>
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) : memref<2x2x2x2xi32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) : memref<4x4xi32, #hal.descriptor_type<storage_buffer>>
// CHECK: iree_linalg_ext.unpack %[[IN]]
// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %[[OUT]]

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @tensor_unpack_fully_dynamic() {
  %c0 = arith.constant 0 : index
  %inner_d0 = util.unfoldable_constant 2 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>>
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4, 4], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<4x4xi32>> -> tensor<4x4xi32>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 2, %inner_d0, %inner_d0], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x2x2xi32>> -> tensor<2x2x?x?xi32>
  %4 = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [%inner_d0, %inner_d0] into %2 : tensor<2x2x?x?xi32> -> tensor<4x4xi32>
@@ -2408,14 +2665,20 @@ func.func @tensor_unpack_fully_dynamic() {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @reduction_ew() {
  %c5120 = arith.constant 5120 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %cst_0 = arith.constant 1.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1001xf32>>
-  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1x1001xf32>>
-  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1001xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1001xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c5120) : !flow.dispatch.tensor<readonly:tensor<1x1001xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1001xf32>>
  %3 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1001xf32>> -> tensor<1x1001xf32>
  %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1001], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1001xf32>> -> tensor<1001xf32>
  %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1001], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1001xf32>> -> tensor<1x1001xf32>
@@ -2437,21 +2700,29 @@
}
// CHECK: func.func @reduction_ew
-// CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : memref<1001xf32, strided<[1], offset: 1280>, #hal.descriptor_type<storage_buffer>>
-// CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5120) : memref<1x1001xf32, strided<[1001, 1], offset: 1280>, #hal.descriptor_type<storage_buffer>>
-// CHECK: hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x1001xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c5120) : memref<1001xf32, strided<[1], offset: 1280>, #hal.descriptor_type<storage_buffer>>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c5120) : memref<1x1001xf32, strided<[1001, 1], offset: 1280>, #hal.descriptor_type<storage_buffer>>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) : memref<1x1001xf32, #hal.descriptor_type<storage_buffer>>

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>,
+    #hal.descriptor_set.binding<1, uniform_buffer>,
+    #hal.descriptor_set.binding<2, uniform_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
func.func @uniform_storage_buffer() {
  %c0 = arith.constant 0 : index
-  %m = hal.interface.constant.load[0] : index
-  %n = hal.interface.constant.load[1] : index
-  %k = hal.interface.constant.load[2] : index
-  %lhs = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%m, %k}
-  %rhs = hal.interface.binding.subspan set(0) binding(1) type(uniform_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%k, %n}
-  %init = hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%m, %n}
-  %result = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%m, %n}
+  %m = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %n = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %k = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %lhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%m, %k}
+  %rhs = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%k, %n}
+  %init = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%m, %n}
+  %result = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%m, %n}
  %wg_id_y = hal.interface.workgroup.id[1] : index
  %wg_count_y = hal.interface.workgroup.count[1] : index
  %wg_size_y = hal.interface.workgroup.size[1] : index
@@ -2477,22 +2748,30 @@
}
// CHECK-LABEL: func.func @uniform_storage_buffer()
-// CHECK: hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
-// CHECK: hal.interface.binding.subspan set(0) binding(1) type(uniform_buffer) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
-// CHECK: hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
-// CHECK: hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, uniform_buffer>
+  ]>
+]>
func.func @micro_kernel_op() {
-  %d0 = hal.interface.constant.load[0] : index
-  %d1 = hal.interface.constant.load[1] : index
-  %s0 = hal.interface.constant.load[2] : f32
-  %s1 = hal.interface.constant.load[3] : i64
-  %arg0_binding = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d0, %d1}
-  %arg1_binding = hal.interface.binding.subspan set(0) binding(1) type(uniform_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%d0, %d1}
-  %arg2_binding = hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%d0, %d1}
-  %arg3_binding = hal.interface.binding.subspan set(0) binding(3) type(uniform_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d0, %d1}
+  %d0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %d1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %s0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : f32
+  %s1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i64
+  %arg0_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d0, %d1}
+  %arg1_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%d0, %d1}
+  %arg2_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readwrite:tensor<?x?xf32>>{%d0, %d1}
+  %arg3_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d0, %d1}
  %arg0 = flow.dispatch.tensor.load %arg0_binding, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%d0, %d1} -> tensor<?x?xf32>
  %arg1 = flow.dispatch.tensor.load %arg1_binding, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1]
@@ -2511,12 +2790,12 @@ func.func @micro_kernel_op() {
  return
}
// CHECK-LABEL: func @micro_kernel_op()
-// CHECK-DAG: %[[S0:.+]] = hal.interface.constant.load[2]
-// CHECK-DAG: %[[S1:.+]] = hal.interface.constant.load[3]
-// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
-// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(uniform_buffer) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
-// CHECK-DAG: %[[ARG2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(uniform_buffer) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
-// CHECK-DAG: %[[ARG3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(uniform_buffer) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
+// CHECK-DAG: %[[S0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2)
+// CHECK-DAG: %[[S1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3)
+// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
+// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[ARG2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<?x?xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-DAG: %[[ARG3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) : memref<?x?xf32, #hal.descriptor_type<uniform_buffer>>
// CHECK: iree_codegen.ukernel.generic "foo"
// CHECK-SAME: ins(%[[ARG0]] :
// CHECK-SAME: outs(%[[ARG1]], %[[ARG2]] :
@@ -2525,11 +2804,17 @@ func.func @micro_kernel_op() {

// -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
func.func @sub_byte_bufferize_with_offset() {
  %c64 = arith.constant 64 : index
  %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64xi4>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<64xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64xi4>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<64xf32>>
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %2 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
  %3 = flow.dispatch.tensor.load %1, offsets = [%2], sizes = [64], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<64xf32>> -> tensor<64xf32>
@@ -2545,7 +2830,7 @@ func.func @sub_byte_bufferize_with_offset() {
}
// CHECK-LABEL: func.func @sub_byte_bufferize_with_offset()
// CHECK: %[[C64:.+]] = arith.constant 64 : index
-// CHECK: hal.interface.binding.subspan set(0) binding(0)
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
// CHECK-SAME: memref<64xi4, strided<[1], offset: 128>

// -----

diff --git a/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir b/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir
index 6c7e88cc5c877..ae710180c9958 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/iree_expand_strided_metadata.mlir
@@ -64,9 +64,14 @@ func.func
@resolve_subview_rankreducing_not_at_the_end_memref(%arg0: memref<8x16 // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @resolve_binding_subspan_zero_offset_memref() -> (memref, index, index, index, index, index) { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<512x384xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<512x384xf32> %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %0 : memref<512x384xf32> -> memref, index, index, index, index, index return %base_buffer, %offset, %sizes#0, %sizes#1, %strides#0, %strides#1 : memref, index, index, index, index, index } @@ -75,14 +80,19 @@ func.func @resolve_binding_subspan_zero_offset_memref() -> (memref, index, // CHECK-DAG: %[[C384:.+]] = arith.constant 384 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<196608xf32> +// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : memref<196608xf32> // CHECK: %[[BASE_PTR:.+]] = memref.reinterpret_cast %[[BINDING]] to offset: [0], sizes: [], strides: [] // CHECK: return %[[BASE_PTR]], %[[C0]], %[[C512]], %[[C384]], %[[C384]], %[[C1]] // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @resolve_binding_subspan_offset_index_memref(%arg0 : index) -> (memref, index, index, index, index, index) { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%arg0) : memref<512x384xindex, strided<[384, 1], offset:?>> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%arg0) : memref<512x384xindex, strided<[384, 1], offset:?>> %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %0 : memref<512x384xindex, strided<[384, 1], offset:?>> -> memref, index, index, index, index, index return %base_buffer, %offset, %sizes#0, %sizes#1, %strides#0, %strides#1 : memref, index, index, index, index, index } @@ -96,15 +106,20 @@ func.func @resolve_binding_subspan_offset_index_memref(%arg0 : index) -> (memref // CHECK: %[[SIZEOF:.+]] = util.sizeof index // CHECK: %[[OFFSET:.+]] = affine.apply #[[MAP0]]()[%arg0, %[[SIZEOF]]] // CHECK: %[[SUBSPAN_SIZE:.+]] = affine.apply #[[MAP1]]()[%arg0, %[[SIZEOF]]] -// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref{%[[SUBSPAN_SIZE]]} +// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : memref{%[[SUBSPAN_SIZE]]} // CHECK: %[[BASE_PTR:.+]] = memref.reinterpret_cast %[[BINDING]] to offset: [0], sizes: [], strides: [] // CHECK: return %[[BASE_PTR]], %[[OFFSET]], %[[C512]], %[[C384]], %[[C384]], %[[C1]] // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @resolve_binding_subspan_dyn_dims_memref(%arg0 : index, %arg1 : index) -> (memref, index, index, index, index, index) { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref{%arg0, %arg1} + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) 
offset(%c0) : memref{%arg0, %arg1} %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %0 : memref -> memref, index, index, index, index, index return %base_buffer, %offset, %sizes#0, %sizes#1, %strides#0, %strides#1 : memref, index, index, index, index, index } @@ -113,7 +128,7 @@ func.func @resolve_binding_subspan_dyn_dims_memref(%arg0 : index, %arg1 : index) // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[SIZE:.+]] = affine.apply #[[MAP]]()[%arg0, %arg1] -// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref{%[[SIZE]]} +// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : memref{%[[SIZE]]} // CHECK: %[[BASE_PTR:.+]] = memref.reinterpret_cast %[[BINDING]] to offset: [0], sizes: [], strides: [] // CHECK: return %[[BASE_PTR]], %[[C0]], %arg0, %arg1, %arg1, %[[C1]] @@ -169,11 +184,17 @@ func.func @resolve_global_memref() -> (memref, index, index, index, index, // ----- -// Test for the part of the pass that converts iree_codegen to memref +// Tests for the part of the pass that converts iree_codegen to memref. + +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @external_func_entry_point() -> (memref, index) { - %0 = hal.interface.constant.load[0] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %1 = arith.index_castui %0 : i32 to index - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%1) flags(ReadOnly) : memref<1x8x768xbf16, strided<[6144, 768, 1], offset: ?>> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%1) flags(ReadOnly) : memref<1x8x768xbf16, strided<[6144, 768, 1], offset: ?>> %base_buffer, %offset, %sizes:3, %strides:3 = iree_codegen.extract_strided_metadata %2 : memref<1x8x768xbf16, strided<[6144, 768, 1], offset: ?>> -> memref, index, index, index, index, index, index, index return %base_buffer, %offset : memref, index } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir index 8e360eefd077c..ee5423168a148 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir @@ -201,30 +201,42 @@ func.func @batch_matmul_fill_dynamic(%arg0 : tensor, %arg1 : tensor, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @drop_encoding_for_hal_flow_ops_static() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, matmul_narrow_M = 1 
: index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1xf32> %3 = iree_encoding.set_encoding %2 : tensor<1x1xf32> -> tensor<1x1xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>> return } // CHECK-LABEL: func.func @drop_encoding_for_hal_flow_ops_static -// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.+}} : !flow.dispatch.tensor> -// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.+}} : !flow.dispatch.tensor> +// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.+}} : !flow.dispatch.tensor> +// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.+}} : !flow.dispatch.tensor> // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[IN]] // CHECK: flow.dispatch.tensor.store %[[LOAD]], %[[OUT]] // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @drop_encoding_for_hal_flow_ops_dynamic() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 %4 = arith.extui %0 : i32 to i64 %5 = arith.extui %1 : i32 to i64 %6 = arith.shli %5, %c32_i64 : i64 @@ -237,15 +249,15 @@ func.func @drop_encoding_for_hal_flow_ops_dynamic() { %13 = arith.index_castui %12 : i64 to index %14 = flow.dispatch.workload.ordinal %8, 0 : index %15 = flow.dispatch.workload.ordinal %13, 1 : index - %16 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%14, %15} - %17 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%14, %15} + %16 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%14, %15} + %17 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : 
!flow.dispatch.tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%14, %15} %18 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [%14, %15], strides = [1, 1] : !flow.dispatch.tensor>{%14, %15} -> tensor %19 = iree_encoding.set_encoding %18 : tensor -> tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> flow.dispatch.tensor.store %19, %17, offsets = [0, 0], sizes = [%14, %15], strides = [1, 1] : tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%14, %15} return } // CHECK-LABEL: func.func @drop_encoding_for_hal_flow_ops_dynamic -// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.+}} : !flow.dispatch.tensor> -// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.+}} : !flow.dispatch.tensor> +// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.+}} : !flow.dispatch.tensor> +// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.+}} : !flow.dispatch.tensor> // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[IN]] // CHECK: flow.dispatch.tensor.store %[[LOAD]], %[[OUT]] diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir index 2a6ff51cbad8a..8e80747233c0c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_user_configs.mlir @@ -3,13 +3,20 @@ #config = #iree_codegen.lowering_config #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {target_triple = "x86_64-xyz-xyz"}> #translation = #iree_codegen.translation_info +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #compilation = #iree_codegen.compilation_info module { func.func @preset_config() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x512xf32> %5 = tensor.empty() : tensor<128x512xf32> diff --git 
a/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir b/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir index 2cf5726e84baf..b07b2b5335275 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/optimize_tensor_insert_extract_slices.mlir @@ -63,6 +63,13 @@ func.func @fold_extract_slice_consumer_into_xfer_write_3(%arg0: vector<1x64x128x // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * 128)> #map2 = affine_map<()[s0] -> (s0 * -64 + 968, 64)> @@ -75,7 +82,7 @@ func.func @batch_matmul_with_padding_strategy(%arg0: tensor<1x?x1280xf16>, %arg1 %c1 = arith.constant 1 : index %cst_0 = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_z = hal.interface.workgroup.id[2] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %1 = affine.apply #map()[%workgroup_id_y] @@ -112,13 +119,19 @@ func.func @batch_matmul_with_padding_strategy(%arg0: tensor<1x?x1280xf16>, %arg1 // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @_batch_matmul_narrow_n_2_dispatch_4_unpack_i32() attributes {translation_info = #iree_codegen.translation_info} { %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index scf.for %arg0 = %workgroup_id_x to %c2 step %workgroup_count_x { diff --git a/compiler/src/iree/compiler/Codegen/Common/test/remove_dead_allocs.mlir b/compiler/src/iree/compiler/Codegen/Common/test/remove_dead_allocs.mlir index 68655d7ec03ef..6607225f0ad17 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/remove_dead_allocs.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/remove_dead_allocs.mlir @@ -19,8 +19,13 @@ func.func @alloc_keep(%arg0: index, %arg1: index) -> memref { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @cleanup_only_assume_alignment_uses() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<42xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<42xf32> memref.assume_alignment %0, 64 : memref<42xf32> return } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir 
b/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir index 84ab767020692..4fa8f0a4413b3 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/remove_trivial_loops.mlir @@ -206,11 +206,11 @@ hal.executable private @simple_mul { %cst = arith.constant 0.000000e+00 : f32 %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<4xf32> memref.assume_alignment %0, 64 : memref<4xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<4xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<4xf32> memref.assume_alignment %1, 64 : memref<4xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<4xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<4xf32> memref.assume_alignment %2, 64 : memref<4xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir index d5738c8bc4b04..42e5487db5fcd 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir @@ -2,7 +2,7 @@ // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-tile-and-distribute-to-workgroups{max-workgroup-parallel-dims=1}, canonicalize)), cse)))' --split-input-file %s | FileCheck %s -check-prefix=CHECKW // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-tile-and-distribute-to-workgroups{distribution-method=2})), canonicalize, cse)))' --split-input-file %s | FileCheck %s -check-prefix=NO-LOOP #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -25,19 +25,19 @@ hal.executable private @matmul_tensors { } builtin.module { func.func @matmul_tensors() attributes {translation_info = #translation} { - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) 
binding(1) : !flow.dispatch.tensor>{%2, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1} - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1} %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor @@ -69,13 +69,13 @@ hal.executable private @matmul_tensors { // CHECK: hal.return %[[D1]], %[[D0]], %[[C1]] : index, index, index // CHECK: func.func @matmul_tensors() // CHECK-SAME: translation_info = #[[TRANSLATION]] -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] -// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] -// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) -// CHECK-DAG: %[[INIT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) -// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(3) +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) +// CHECK-DAG: %[[N:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) +// CHECK-DAG: %[[K:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[INIT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) // CHECK-DAG: %[[WG_ID_X:.+]] = hal.interface.workgroup.id[0] // CHECK-DAG: %[[WG_COUNT_X:.+]] = hal.interface.workgroup.count[0] // CHECK-DAG: %[[WG_ID_Y:.+]] = hal.interface.workgroup.id[1] @@ -100,7 +100,7 @@ hal.executable private @matmul_tensors { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -124,15 +124,15 @@ hal.executable private @add { } builtin.module { func.func @add() attributes {translation_info = #translation} { - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1} - %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%1} - %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1} %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor @@ -175,12 +175,11 @@ 
hal.executable private @add { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<3, storage_buffer> + #hal.descriptor_set.binding<2, storage_buffer> ]> ]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { @@ -198,19 +197,19 @@ hal.executable private @add4D { } builtin.module { func.func @add4D() attributes {translation_info = #translation} { - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index - %cl_3 = hal.interface.constant.load[3] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index %3 = flow.dispatch.workload.ordinal %cl_3, 3 : index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %2, %3} -> tensor @@ -255,12 +254,11 @@ hal.executable private @add4D { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<3, storage_buffer> + #hal.descriptor_set.binding<2, storage_buffer> ]> ]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { @@ -278,19 +276,19 @@ hal.executable private @add_distribute4D { } builtin.module { func.func @add_distribute4D() attributes {translation_info = #translation} { - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index - %cl_3 = hal.interface.constant.load[3] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index %3 = flow.dispatch.workload.ordinal %cl_3, 3 : index - %4 = hal.interface.binding.subspan set(0) binding(0) 
type(storage_buffer) alignment(32) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %2, %3} -> tensor @@ -335,13 +333,13 @@ hal.executable private @add_distribute4D { // CHECK: hal.return %[[D2]], %[[D1]], %[[D0]] : index, index, index // CHECK: func.func @add_distribute4D() // CHECK-SAME: translation_info = #[[TRANSLATION]] -// CHECK-DAG: %[[D0:.*]] = hal.interface.constant.load[0] : index -// CHECK-DAG: %[[D1:.*]] = hal.interface.constant.load[1] : index -// CHECK-DAG: %[[D2:.*]] = hal.interface.constant.load[2] : index -// CHECK-DAG: %[[D3:.*]] = hal.interface.constant.load[3] : index -// CHECK-DAG: %[[D4:.*]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -// CHECK-DAG: %[[D5:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -// CHECK-DAG: %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} +// CHECK-DAG: %[[D0:.*]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK-DAG: %[[D1:.*]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index +// CHECK-DAG: %[[D2:.*]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index +// CHECK-DAG: %[[D3:.*]] = hal.interface.constant.load layout({{.+}}) ordinal(3) : index +// CHECK-DAG: %[[D4:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(32) : !flow.dispatch.tensor>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} +// CHECK-DAG: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(32) : !flow.dispatch.tensor>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} +// CHECK-DAG: %[[D6:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(32) : !flow.dispatch.tensor>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} // CHECK: %[[WORKGROUP_ID_X:.*]] = hal.interface.workgroup.id[0] : index // CHECK: %[[WORKGROUP_COUNT_X:.*]] = hal.interface.workgroup.count[0] : index // CHECK: %[[WORKGROUP_ID_Y:.*]] = hal.interface.workgroup.id[1] : index @@ -376,12 +374,11 @@ hal.executable private @add_distribute4D { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<3, storage_buffer> + #hal.descriptor_set.binding<2, storage_buffer> ]> ]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { @@ -399,19 +396,19 @@ hal.executable private @add_distribute4D_zero_tile_size { } builtin.module { func.func @add_distribute4D_zero_tile_size() attributes {translation_info = #translation} { - %cl_0 = 
hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index - %cl_3 = hal.interface.constant.load[3] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index %3 = flow.dispatch.workload.ordinal %cl_3, 3 : index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2, %3} %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %2, %3} -> tensor @@ -452,7 +449,7 @@ hal.executable private @add_distribute4D_zero_tile_size { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -474,19 +471,19 @@ hal.executable private @batch_matmul_tensors { builtin.module { func.func @batch_matmul_tensors() attributes {translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index - %cl_3 = hal.interface.constant.load[3] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index %3 = flow.dispatch.workload.ordinal %cl_3, 3 : index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor>{%0, %1, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) : !flow.dispatch.tensor>{%0, %3, %2} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2} %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [%0, %1, %3], strides = [1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %3} -> tensor @@ -545,11 +542,11 @@ 
hal.executable private @preset_config_matmul_tensors { builtin.module { func.func @preset_config() attributes {translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> @@ -590,7 +587,7 @@ hal.executable private @preset_config_matmul_tensors { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer> @@ -607,16 +604,16 @@ hal.executable public @copy_op { } builtin.module { func.func @copy_op() attributes {translation_info = #translation} { - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index - %cl_3 = hal.interface.constant.load[3] : index - %cl_4 = hal.interface.constant.load[4] : index - %cl_5 = hal.interface.constant.load[5] : index - %cl_6 = hal.interface.constant.load[6] : index - %cl_7 = hal.interface.constant.load[7] : index - %cl_8 = hal.interface.constant.load[8] : index - %cl_9 = hal.interface.constant.load[9] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %cl_4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %cl_5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %cl_6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index + %cl_7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : index + %cl_8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : index + %cl_9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : index %source_size_y = flow.dispatch.workload.ordinal %cl_0, 0: index %source_size_x = flow.dispatch.workload.ordinal %cl_1, 1: index %dest_size_y = flow.dispatch.workload.ordinal %cl_2, 2: index @@ -627,8 +624,8 @@ hal.executable public @copy_op { %dest_offset_x = flow.dispatch.workload.ordinal %cl_7, 7: index %slice_size_y = flow.dispatch.workload.ordinal %cl_8, 8: index %slice_size_x = flow.dispatch.workload.ordinal %cl_9, 9: index - %source = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref{%source_size_y, %source_size_x} - %dest = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref{%dest_size_y, %dest_size_x} + %source = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref{%source_size_y, %source_size_x} + %dest = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref{%dest_size_y, %dest_size_x} %source_subview = memref.subview %source[%source_offset_y, %source_offset_x] [%slice_size_y, 
%slice_size_x] [1, 1] : memref to memref> %dest_subview = memref.subview %dest[%dest_offset_y, %dest_offset_x] [%slice_size_y, %slice_size_x] [1, 1] : memref to memref> linalg.generic { @@ -656,18 +653,18 @@ hal.executable public @copy_op { // CHECK: hal.return %[[D1]], %[[D0]], %[[C1]] // CHECK: func.func @copy_op() // CHECK-SAME: translation_info = #[[TRANSLATION]] -// CHECK-DAG: %[[SOURCE_SIZE_Y:.+]] = hal.interface.constant.load[0] : index -// CHECK-DAG: %[[SOURCE_SIZE_X:.+]] = hal.interface.constant.load[1] : index -// CHECK-DAG: %[[DEST_SIZE_Y:.+]] = hal.interface.constant.load[2] : index -// CHECK-DAG: %[[DEST_SIZE_X:.+]] = hal.interface.constant.load[3] : index -// CHECK-DAG: %[[SOURCE_OFFSET_Y:.+]] = hal.interface.constant.load[4] : index -// CHECK-DAG: %[[SOURCE_OFFSET_X:.+]] = hal.interface.constant.load[5] : index -// CHECK-DAG: %[[DEST_OFFSET_Y:.+]] = hal.interface.constant.load[6] : index -// CHECK-DAG: %[[DEST_OFFSET_X:.+]] = hal.interface.constant.load[7] : index -// CHECK-DAG: %[[SLICE_SIZE_Y:.+]] = hal.interface.constant.load[8] : index -// CHECK-DAG: %[[SLICE_SIZE_X:.+]] = hal.interface.constant.load[9] : index -// CHECK-DAG: %[[SOURCE_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[DEST_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[SOURCE_SIZE_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index +// CHECK-DAG: %[[SOURCE_SIZE_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index +// CHECK-DAG: %[[DEST_SIZE_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(2) : index +// CHECK-DAG: %[[DEST_SIZE_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(3) : index +// CHECK-DAG: %[[SOURCE_OFFSET_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(4) : index +// CHECK-DAG: %[[SOURCE_OFFSET_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(5) : index +// CHECK-DAG: %[[DEST_OFFSET_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(6) : index +// CHECK-DAG: %[[DEST_OFFSET_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(7) : index +// CHECK-DAG: %[[SLICE_SIZE_Y:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(8) : index +// CHECK-DAG: %[[SLICE_SIZE_X:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(9) : index +// CHECK-DAG: %[[SOURCE_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[DEST_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[SOURCE:.+]] = memref.subview %[[SOURCE_BINDING]][%[[SOURCE_OFFSET_Y]], %[[SOURCE_OFFSET_X]]] // CHECK-DAG: %[[DEST:.+]] = memref.subview %[[DEST_BINDING]][%[[DEST_OFFSET_Y]], %[[DEST_OFFSET_X]]] // CHECK-DAG: %[[WG_ID_X:.+]] = hal.interface.workgroup.id[0] @@ -711,9 +708,9 @@ hal.executable private @static_1d_fft_stage2 { %c2 = arith.constant 2 : index %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32> %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor> -> tensor<32xf32> @@ -768,8 
+765,8 @@ hal.executable private @static_3d_fft_stage3 { %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32> %0 = bufferization.to_memref %cst_0 : memref<4xf32> %1 = bufferization.to_memref %cst : memref<4xf32> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32> - %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x128x32xf32> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<64x128x32xf32> iree_linalg_ext.fft {lowering_config = #config} ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>) return @@ -797,7 +794,7 @@ hal.executable private @static_3d_fft_stage3 { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -820,17 +817,17 @@ hal.executable private @outs_fusion { builtin.module { func.func @outs_fusion_fn() attributes {translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1} %6 = tensor.empty(%0, %1) : tensor %7 = linalg.generic { @@ -882,7 +879,7 @@ hal.executable private @outs_fusion { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -903,15 +900,15 @@ hal.executable private @conv { } builtin.module { func.func @conv() attributes {translation_info = #translation} { - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index - %cl_2 = hal.interface.constant.load[2] : index - %cl_3 = hal.interface.constant.load[3] : index - %cl_4 = hal.interface.constant.load[4] : index - %cl_5 = hal.interface.constant.load[5] : index - %cl_6 = hal.interface.constant.load[6] : index - %cl_7 = hal.interface.constant.load[7] : index - %cl_8 = hal.interface.constant.load[8] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %cl_4 = 
hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %cl_5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %cl_6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index + %cl_7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : index + %cl_8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index @@ -921,11 +918,11 @@ hal.executable private @conv { %6 = flow.dispatch.workload.ordinal %cl_6, 6 : index %7 = flow.dispatch.workload.ordinal %cl_7, 7 : index %8 = flow.dispatch.workload.ordinal %cl_8, 8 : index - %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%4, %5, %3, %6} - %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %7, %8, %6} %12 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %2, %3} -> tensor @@ -994,11 +991,11 @@ hal.executable private @conv_static { builtin.module { func.func @conv_static() attributes {translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 96], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x161x161x96xf32> @@ -1068,9 +1065,9 @@ hal.executable private @generic_static { } builtin.module { func.func @generic_static() attributes {translation_info = #translation} { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [96, 16], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<96x16xf32> @@ -1135,11 +1132,11 @@ hal.executable private @matmul_static { builtin.module { func.func @matmul_static() attributes {translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %1 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 240], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<196x240xf32> @@ -1190,11 +1187,11 @@ hal.executable private @restrict_num_workgroups { builtin.module { func.func @restrict_num_workgroups() attributes {translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 11, 11, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x11x11x576xf32> @@ -1295,7 +1292,7 @@ hal.executable private @reduction { // ----- #config = #iree_codegen.lowering_config -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -1317,15 +1314,15 @@ hal.executable private @gemm_unit_N { builtin.module { func.func @gemm_unit_N() attributes {translation_info = #translation} { %c0 = arith.constant 0 : index - %cl_0 = hal.interface.constant.load[0] : index - %cl_1 = hal.interface.constant.load[1] : index + %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%0, %1} - %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%1} - %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%0} %5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%1, 1], strides = [1, 1] : !flow.dispatch.tensor>{%1} -> tensor @@ -1354,7 +1351,7 @@ hal.executable private @gemm_unit_N { // CHECK: hal.return %[[D0]], %[[C1]], %[[C1]] : index, index, index // CHECK: func.func @gemm_unit_N() // CHECK-SAME: translation_info = #[[TRANSLATION]] -// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] +// CHECK-DAG: %[[M:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) // CHECK-DAG: %[[WG_ID_X:.+]] = hal.interface.workgroup.id[0] // CHECK-DAG: %[[WG_COUNT_X:.+]] = hal.interface.workgroup.count[0] // CHECK-DAG: %[[LB:.+]] = affine.apply #[[MAP1]]()[%[[WG_ID_X]]] @@ -1367,7 +1364,7 @@ hal.executable private @gemm_unit_N { // ----- 
 #config = #iree_codegen.lowering_config
-#pipeline_layout = #hal.pipeline.layout,
     #hal.descriptor_set.binding<1, storage_buffer>,
@@ -1389,12 +1386,12 @@ hal.executable private @gemm_unit_M_unit_N {
    builtin.module {
      func.func @gemm_unit_M_unit_N() attributes {translation_info = #translation} {
        %c0 = arith.constant 0 : index
-        %0 = hal.interface.constant.load[0] : index
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0)
+        %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0)
            : !flow.dispatch.tensor>{%0}
-        %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0)
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0)
            : !flow.dispatch.tensor>{%0}
-        %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0)
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0)
            : !flow.dispatch.tensor>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, %0], strides = [1, 1]
            : !flow.dispatch.tensor>{%0} -> tensor<1x?xf32>
@@ -1426,11 +1423,10 @@ hal.executable private @gemm_unit_M_unit_N {
 // -----
 #config = #iree_codegen.lowering_config
-#pipeline_layout = #hal.pipeline.layout,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>
   ]>
 ]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
@@ -1448,17 +1444,17 @@ hal.executable private @generic_unit_dims {
    }
    builtin.module {
      func.func @generic_unit_dims() attributes {translation_info = #translation} {
-        %cl_0 = hal.interface.constant.load[0] : index
-        %cl_1 = hal.interface.constant.load[1] : index
-        %cl_2 = hal.interface.constant.load[2] : index
-        %cl_3 = hal.interface.constant.load[3] : index
+        %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+        %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+        %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
        %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index
        %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index
        %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index
        %3 = flow.dispatch.workload.ordinal %cl_3, 3 : index
-        %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+        %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
            : !flow.dispatch.tensor>{%0, %1, %2, %3}
-        %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+        %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
            : !flow.dispatch.tensor>{%0, %1, %2, %3}
        %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %0, 1, 1, %1, %2, 1, %3], strides = [1, 1, 1, 1, 1, 1, 1, 1]
            : !flow.dispatch.tensor>{%0, %1, %2, %3} -> tensor<1x?x1x1x?x?x1x?xf32>
@@ -1501,11 +1497,10 @@ hal.executable private @generic_unit_dims {
 // -----
 #config = #iree_codegen.lowering_config
-#pipeline_layout = #hal.pipeline.layout,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>
   ]>
 ]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
@@ -1524,11 +1519,11 @@ hal.executable private @reduce_to_scalar {
    }
    builtin.module {
      func.func @reduce_to_scalar() attributes {translation_info = #translation} {
-        %cl_0 = hal.interface.constant.load[0] : index
+        %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
        %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
            : !flow.dispatch.tensor>{%0}
-        %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
            : !flow.dispatch.tensor>
        %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [%0], strides = [1]
            : !flow.dispatch.tensor>{%0} -> tensor
@@ -1565,8 +1560,7 @@ hal.executable private @reduce_to_scalar {
 #pipeline_layout = #hal.pipeline.layout,
-    #hal.descriptor_set.binding<1, storage_buffer>,
-    #hal.descriptor_set.binding<2, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>
   ]>
 ]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
@@ -1584,9 +1578,9 @@ hal.executable private @scalar {
    }
    builtin.module {
      func.func @scalar() attributes {translation_info = #translation} {
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
            : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
            : !flow.dispatch.tensor>
        %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = []
            : !flow.dispatch.tensor> -> tensor
@@ -1641,9 +1635,9 @@ hal.executable private @rank_reduced_slice {
    }
    builtin.module {
      func.func @rank_reduced_slice() attributes {translation_info = #translation} {
-        %in_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+        %in_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
            : !flow.dispatch.tensor>
-        %out_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+        %out_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
            : !flow.dispatch.tensor>
        %in = flow.dispatch.tensor.load %in_binding, offsets = [3, 10], sizes = [1, 10], strides = [2, 1]
            : !flow.dispatch.tensor> -> tensor<10xf32>
@@ -1673,9 +1667,9 @@ hal.executable private @rank_reduced_slice {
 // CHECK: hal.return %[[C5]], %[[C1]], %[[C1]]
 // CHECK: func.func @rank_reduced_slice()
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-DAG: %[[SRC_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0)
+// CHECK-DAG: %[[SRC_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
 // CHECK-SAME: : !flow.dispatch.tensor>
-// CHECK-DAG: %[[DST_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[DST_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-SAME: : !flow.dispatch.tensor>
 // CHECK: scf.for %[[IV0:.+]] =
 // CHECK: %[[OFFSET:.+]] = affine.apply #[[MAP]]()[%[[IV0]]]
@@ -1687,7 +1681,7 @@ hal.executable private @rank_reduced_slice {
 // -----
 #config = #iree_codegen.lowering_config
-#pipeline_layout = #hal.pipeline.layout,
     #hal.descriptor_set.binding<1, storage_buffer>,
@@ -1709,19 +1703,19 @@ hal.executable private @matmul_interchange {
    }
    builtin.module {
      func.func @matmul_interchange() attributes {translation_info = #translation} {
-        %cl_0 = hal.interface.constant.load[0] : index
-        %cl_1 = hal.interface.constant.load[1] : index
-        %cl_2 = hal.interface.constant.load[2] : index
+        %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+        %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
        %0 = flow.dispatch.workload.ordinal %cl_0, 0 : index
        %1 = flow.dispatch.workload.ordinal %cl_1, 1 : index
        %2 = flow.dispatch.workload.ordinal %cl_2, 2 : index
-        %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
            : !flow.dispatch.tensor>{%0, %2}
-        %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+        %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
            : !flow.dispatch.tensor>{%2, %1}
-        %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+        %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2)
            : !flow.dispatch.tensor>{%0, %1}
-        %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+        %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3)
            : !flow.dispatch.tensor>{%0, %1}
        %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1]
            : !flow.dispatch.tensor>{%0, %2} -> tensor
@@ -1752,16 +1746,22 @@ hal.executable private @matmul_interchange {
 // CHECK: hal.return %[[D0]], %[[D1]], %[[C1]] : index, index, index
 // CHECK: func.func @matmul_interchange()
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] : index
-// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] : index
+// CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+// CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
 // CHECK: scf.for %{{.+}} = %{{.+}} to %[[D1]] step %{{.+}} {
 // CHECK: scf.for %{{.+}} = %{{.+}} to %[[D0]] step %{{.+}} {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @no_compute {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
-    hal.executable.export public @no_compute ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) {
+    hal.executable.export public @no_compute ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4 : index, %arg5 : index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4, %arg5
      hal.return %x, %y, %z : index, index, index
@@ -1769,11 +1769,11 @@ hal.executable private @no_compute {
    builtin.module {
      func.func @no_compute() attributes {translation_info = #iree_codegen.translation_info} {
        %c0 = arith.constant 0 : index
-        %cl_0 = hal.interface.constant.load[0] : i32
-        %cl_1 = hal.interface.constant.load[1] : i32
-        %cl_2 = hal.interface.constant.load[2] : i32
-        %cl_3 = hal.interface.constant.load[3] : i32
-        %cl_4 = hal.interface.constant.load[4] : i32
+        %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+        %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+        %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+        %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+        %cl_4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
        %0 = arith.index_cast %cl_0 : i32 to index
        %1 = arith.index_cast %cl_1 : i32 to index
        %2 = arith.index_cast %cl_2 : i32 to index
@@ -1784,9 +1784,9 @@ hal.executable private @no_compute {
        %7 = flow.dispatch.workload.ordinal %2, 2 : index
        %8 = flow.dispatch.workload.ordinal %3, 3 : index
        %9 = flow.dispatch.workload.ordinal %4, 4 : index
-        %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref{%5, %6, %7}
+        %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref{%5, %6, %7}
        memref.assume_alignment %10, 64 : memref
-        %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x?x?xf32>{%8, %9}
+        %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<1x?x?xf32>{%8, %9}
        memref.assume_alignment %11, 64 : memref<1x?x?xf32>
        return
      }
@@ -1800,11 +1800,17 @@ hal.executable private @no_compute {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 hal.executable private @tile_multiuse_producer {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf_x86_64", {}>) {
-    hal.executable.export public @tile_multiuse_producer ordinal(0) layout (#hal.pipeline.layout<
-        push_constants = 0, sets = [<0, bindings = [
-            <0, storage_buffer, ReadOnly>, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>) {
+    hal.executable.export public @tile_multiuse_producer ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -1814,13 +1820,13 @@ hal.executable private @tile_multiuse_producer {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
        %cst_0 = arith.constant 1.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>
-        %s0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0)
+        %s0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>
-        %s1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0)
+        %s1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>
-        %s2 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0)
+        %s2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 128], strides = [1, 1, 1]
            : !flow.dispatch.tensor> -> tensor<12x128x128xf32>
@@ -1871,10 +1877,10 @@ hal.executable private @tile_multiuse_producer {
  }
 }
 // CHECK-LABEL: func @tile_multiuse_producer()
-// CHECK-DAG: %[[SRC_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[RESULT_BINDING0:.+]] = hal.interface.binding.subspan set(0) binding(1)
-// CHECK-DAG: %[[RESULT_BINDING1:.+]] = hal.interface.binding.subspan set(0) binding(2)
-// CHECK-DAG: %[[RESULT_BINDING2:.+]] = hal.interface.binding.subspan set(0) binding(3)
+// CHECK-DAG: %[[SRC_BINDING:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
+// CHECK-DAG: %[[RESULT_BINDING0:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
+// CHECK-DAG: %[[RESULT_BINDING1:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2)
+// CHECK-DAG: %[[RESULT_BINDING2:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3)
 // CHECK: scf.for %[[IV0:.+]] =
 // CHECK: scf.for %[[IV1:.+]] =
 // CHECK: %[[SRC:.+]] = flow.dispatch.tensor.load %[[SRC_BINDING]], offsets = [%[[IV0]], %[[IV1]], 0]
@@ -1896,11 +1902,17 @@ hal.executable private @tile_multiuse_producer {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 hal.executable private @no_tile {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
-    hal.executable.export public @no_tile ordinal(0) layout(#hal.pipeline.layout<
-        push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>, <3, storage_buffer>]>]>)
-    {
+    hal.executable.export public @no_tile ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -1909,10 +1921,10 @@ hal.executable private @no_tile {
      func.func @no_tile() attributes {translation_info = #iree_codegen.translation_info} {
        %c0 = arith.constant 0 : index
        %c64 = arith.constant 64 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-        %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c64) : !flow.dispatch.tensor>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c64) : !flow.dispatch.tensor>
        %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor> -> tensor<10xf32>
        %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor> -> tensor<10xi32>
        %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [3], strides = [1] : !flow.dispatch.tensor> -> tensor<3xf32>
@@ -1935,12 +1947,15 @@ hal.executable private @no_tile {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @pack_lowering {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
-    hal.executable.export public @gemm_lhs_pack ordinal(0)
-        layout(#hal.pipeline.layout, <1, storage_buffer>]>]>)
-    {
+    hal.executable.export public @gemm_lhs_pack ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -1949,9 +1964,9 @@ hal.executable private @pack_lowering {
      func.func @gemm_lhs_pack() attributes {translation_info = #iree_codegen.translation_info} {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0)
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [100, 250], strides = [1, 1]
            : !flow.dispatch.tensor> -> tensor<100x250xf32>
@@ -1975,12 +1990,15 @@ hal.executable private @pack_lowering {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @pack_lowering {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
-    hal.executable.export public @gemm_rhs_transpose_pack ordinal(0)
-        layout(#hal.pipeline.layout, <1, storage_buffer>]>]>)
-    {
+    hal.executable.export public @gemm_rhs_transpose_pack ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -1990,9 +2008,9 @@ hal.executable private @pack_lowering {
        %c0 = arith.constant 0 : index
        %c114688 = arith.constant 114688 : index
        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c114688)
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c114688)
            : !flow.dispatch.tensor>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1]
            : !flow.dispatch.tensor> -> tensor<250x500xf32>
@@ -2015,6 +2033,12 @@ hal.executable private @pack_lowering {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @clone_index_computations {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
    hal.executable.export public @clone_index_computations ordinal(0) layout(
@@ -2029,10 +2053,10 @@ hal.executable private @clone_index_computations {
      func.func @clone_index_computations() attributes {translation_info = #iree_codegen.translation_info} {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
-        %cl_0 = hal.interface.constant.load[0] : i32
-        %cl_1 = hal.interface.constant.load[1] : i32
-        %cl_2 = hal.interface.constant.load[2] : i32
-        %cl_3 = hal.interface.constant.load[3] : i32
+        %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+        %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+        %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+        %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
        %0 = arith.index_castui %cl_0 : i32 to index
        %1 = arith.index_castui %cl_1 : i32 to index
        %2 = arith.index_castui %cl_2 : i32 to index
@@ -2041,11 +2065,11 @@ hal.executable private @clone_index_computations {
        %5 = flow.dispatch.workload.ordinal %1, 1 : index
        %6 = flow.dispatch.workload.ordinal %2, 2 : index
        %7 = flow.dispatch.workload.ordinal %3, 3 : index
-        %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
+        %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>{%4, %5}
        %9 = affine.apply affine_map<()[s0] -> (s0 ceildiv 8)>()[%6]
        %10 = affine.apply affine_map<()[s0] -> (s0 ceildiv 4)>()[%7]
-        %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0)
+        %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0)
            : !flow.dispatch.tensor>{%9, %10}
        %12 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1]
            : !flow.dispatch.tensor>{%4, %5} -> tensor
@@ -2078,12 +2102,15 @@ hal.executable private @clone_index_computations {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @dynamic_unpack {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
-    hal.executable.export public @dynamic_unpack ordinal(0) layout(
-        #hal.pipeline.layout, <1, storage_buffer>]>]>)
-    {
+    hal.executable.export public @dynamic_unpack ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3, %arg4
      hal.return %x, %y, %z : index, index, index
@@ -2092,10 +2119,10 @@ hal.executable private @dynamic_unpack {
      func.func @dynamic_unpack() attributes {translation_info = #iree_codegen.translation_info} {
        %c131072 = arith.constant 131072 : index
        %c0 = arith.constant 0 : index
-        %cl_0 = hal.interface.constant.load[0] : i32
-        %cl_1 = hal.interface.constant.load[1] : i32
-        %cl_2 = hal.interface.constant.load[2] : i32
-        %cl_3 = hal.interface.constant.load[3] : i32
+        %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+        %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+        %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+        %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
        %0 = arith.index_castui %cl_0 : i32 to index
        %1 = arith.index_castui %cl_1 : i32 to index
        %2 = arith.index_castui %cl_2 : i32 to index
@@ -2104,8 +2131,8 @@ hal.executable private @dynamic_unpack {
        %5 = flow.dispatch.workload.ordinal %1, 1 : index
        %6 = flow.dispatch.workload.ordinal %2, 2 : index
        %7 = flow.dispatch.workload.ordinal %3, 3 : index
-        %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%4, %5}
-        %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor>{%6, %7}
+        %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%4, %5}
+        %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c131072) : !flow.dispatch.tensor>{%6, %7}
        %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %5} -> tensor
        %11 = tensor.empty(%6, %7) : tensor
        %12 = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11
@@ -2124,6 +2151,12 @@ hal.executable private @dynamic_unpack {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @dynamic_unpack_dynamic_tile {
   hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {}>) {
    hal.executable.export public @dynamic_unpack_dynamic_tile ordinal(0) layout(
@@ -2140,10 +2173,10 @@ hal.executable private @dynamic_unpack_dynamic_tile {
        %c0 = arith.constant 0 : index
        %c16 = arith.constant 16 : index
        %c32 = arith.constant 32 : index
-        %cl_0 = hal.interface.constant.load[0] : i32
-        %cl_1 = hal.interface.constant.load[1] : i32
-        %cl_2 = hal.interface.constant.load[2] : i32
-        %cl_3 = hal.interface.constant.load[3] : i32
+        %cl_0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+        %cl_1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+        %cl_2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+        %cl_3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
        %0 = arith.index_castui %cl_0 : i32 to index
        %1 = arith.index_castui %cl_1 : i32 to index
        %2 = arith.index_castui %cl_2 : i32 to index
@@ -2152,8 +2185,8 @@ hal.executable private @dynamic_unpack_dynamic_tile {
        %5 = flow.dispatch.workload.ordinal %1, 1 : index
        %6 = flow.dispatch.workload.ordinal %2, 2 : index
        %7 = flow.dispatch.workload.ordinal %3, 3 : index
-        %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%4, %5, %c32, %c16}
-        %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor>{%6, %7}
+        %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%4, %5, %c32, %c16}
+        %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c131072) : !flow.dispatch.tensor>{%6, %7}
        %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, %c32, %c16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %5, %c32, %c16} -> tensor
        %11 = tensor.empty(%6, %7) : tensor
        %12 = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [%c32, %c16] into %11
@@ -2172,12 +2205,15 @@ hal.executable private @dynamic_unpack_dynamic_tile {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @unpack_elem {
   hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {}>) {
-    hal.executable.export public @unpack_elem ordinal(0) layout(
-        #hal.pipeline.layout, <1, storage_buffer>]>]>)
-    {
+    hal.executable.export public @unpack_elem ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -2185,8 +2221,8 @@ hal.executable private @unpack_elem {
    builtin.module {
      func.func @unpack_elem() attributes {translation_info = #iree_codegen.translation_info} {
        %c0 = arith.constant 0 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [16, 48, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x48x8x8xf32>
        %3 = tensor.empty() : tensor<128x384xf32>
        %4 = tensor.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %3 {lowering_config = #iree_codegen.lowering_config} : tensor<16x48x8x8xf32> -> tensor<128x384xf32>
@@ -2210,12 +2246,18 @@ hal.executable private @unpack_elem {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
 hal.executable private @dynamic_unpack_fusion {
   hal.executable.variant public @vmvx_bytecode_fb target(<"vmvx", "vmvx-bytecode-fb", {ukernels = true}>) {
-    hal.executable.export public @dynamic_unpack_fusion ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) {
+    hal.executable.export public @dynamic_unpack_fusion ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -2235,10 +2277,10 @@ hal.executable private @dynamic_unpack_fusion {
        %0:2 = iree_codegen.query_tile_sizes tensor<12544x16xi32, #iree_encoding.encoding> -> index, index
        %1 = affine.apply affine_map<()[s0] -> (12544 ceildiv s0)>()[%0#0]
        %2 = affine.apply affine_map<()[s0] -> (16 ceildiv s0)>()[%0#1]
-        %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c200960) flags(ReadOnly) : !flow.dispatch.tensor>{%1, %2, %0#0, %0#1}
-        %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1003776) flags(ReadOnly) : !flow.dispatch.tensor>
-        %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1053952) flags(ReadOnly) : !flow.dispatch.tensor>
-        %6 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c200960) flags(ReadOnly) : !flow.dispatch.tensor>{%1, %2, %0#0, %0#1}
+        %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c1003776) flags(ReadOnly) : !flow.dispatch.tensor>
+        %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c1053952) flags(ReadOnly) : !flow.dispatch.tensor>
+        %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
        %10 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [%1, %2, %0#0, %0#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%1, %2, %0#0, %0#1} -> tensor
        %11 = flow.dispatch.tensor.load %4, offsets = [0], sizes = [12544], strides = [1] : !flow.dispatch.tensor> -> tensor<12544xi32>
        %12 = flow.dispatch.tensor.load %5, offsets = [0], sizes = [16], strides = [1] : !flow.dispatch.tensor> -> tensor<16xi32>
@@ -2269,9 +2311,19 @@ hal.executable private @dynamic_unpack_fusion {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>,
+    #hal.descriptor_set.binding<5, storage_buffer>
+  ]>
+]>
 hal.executable private @elem_pack {
   hal.executable.variant public @embedded_elf_arm_64 target(<"llvm-cpu", "embedded-elf-arm_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>) {
-    hal.executable.export public @elem_pack ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>, <4, storage_buffer>, <5, storage_buffer>]>]>) {
+    hal.executable.export public @elem_pack ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -2286,15 +2338,15 @@ hal.executable private @elem_pack {
        %c1572864 = arith.constant 1572864 : index
        %c2359296 = arith.constant 2359296 : index
        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1339392) flags(ReadOnly) : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c786432) flags(ReadOnly) : !flow.dispatch.tensor>
-        %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-        %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-        %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c823296) flags(ReadOnly) : !flow.dispatch.tensor>
-        %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c825344) flags(ReadOnly) : !flow.dispatch.tensor>
-        %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-        %7 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c1572864) : !flow.dispatch.tensor>
-        %8 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c2359296) : !flow.dispatch.tensor>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c1339392) flags(ReadOnly) : !flow.dispatch.tensor>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c786432) flags(ReadOnly) : !flow.dispatch.tensor>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+        %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c823296) flags(ReadOnly) : !flow.dispatch.tensor>
+        %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c825344) flags(ReadOnly) : !flow.dispatch.tensor>
+        %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c1572864) : !flow.dispatch.tensor>
+        %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(5) alignment(64) offset(%c2359296) : !flow.dispatch.tensor>
        %9 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 2, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x2x512xf32>
        %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<384x512xf32>
        %11 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<384x512xf32>
@@ -2333,6 +2385,12 @@ hal.executable private @elem_pack {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @scatter {
   hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
    hal.executable.export public @scatter ordinal(0)
@@ -2348,9 +2406,9 @@ hal.executable private @scatter {
        %c251668480 = arith.constant 251668480 : index
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c228075520) flags(ReadOnly) : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c251668480) flags(ReadOnly) : !flow.dispatch.tensor>
-        %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c228075520) flags(ReadOnly) : !flow.dispatch.tensor>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c251668480) flags(ReadOnly) : !flow.dispatch.tensor>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
        %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [5898240], strides = [1] : !flow.dispatch.tensor> -> tensor<5898240xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [5898240, 4], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<5898240x4xi32>
        %5 = tensor.empty() : tensor<1x640x48x48xf32>
@@ -2371,9 +2429,15 @@ hal.executable private @scatter {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @collapse_workgroups_dispatch_dispatch_0 {
   hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
-    hal.executable.export public @collapse_workgroups_dispatch_dispatch_0_generic_1024x128x16x64 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) {
+    hal.executable.export public @collapse_workgroups_dispatch_dispatch_0_generic_1024x128x16x64 ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -2381,8 +2445,8 @@ hal.executable private @collapse_workgroups_dispatch_dispatch_0 {
    builtin.module {
      func.func @collapse_workgroups_dispatch_dispatch_0_generic_1024x128x16x64() {
        %c0 = arith.constant 0 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
        %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1024, 16, 128, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x16x128x64xf32>
        %3 = tensor.empty() : tensor<1024x128x16x64xf32>
        %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d2, d1, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1024x16x128x64xf32>) outs(%3 : tensor<1024x128x16x64xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} {
@@ -2398,7 +2462,7 @@ hal.executable private @collapse_workgroups_dispatch_dispatch_0 {
 // CHECKW-LABEL: hal.executable private @collapse_workgroups_dispatch_dispatch_0 {
 // CHECKW: hal.executable.variant public @cuda_nvptx_fb
-// CHECKW: hal.executable.export public @collapse_workgroups_dispatch_dispatch_0_generic_1024x128x16x64 ordinal(0) layout(#pipeline_layout) {
+// CHECKW: hal.executable.export public @collapse_workgroups_dispatch_dispatch_0_generic_1024x128x16x64 ordinal(0) layout({{.+}}) {
 // CHECKW: ^bb0(%[[ARG0:.*]]: !hal.device):
 // CHECKW-DAG: %[[C2097152:.*]] = arith.constant 2097152 : index
 // CHECKW-DAG: %[[C1:.*]] = arith.constant 1 : index
@@ -2408,7 +2472,7 @@ hal.executable private @collapse_workgroups_dispatch_dispatch_0 {
 // -----
 #config = #iree_codegen.lowering_config
-#pipeline_layout = #hal.pipeline.layout,
     #hal.descriptor_set.binding<1, storage_buffer>,
@@ -2431,16 +2495,16 @@ hal.executable private @matmul_tensors {
    }
    builtin.module {
      func.func @matmul_tensor_count_from_dag_root() attributes {translation_info = #translation} {
-        %0 = hal.interface.constant.load[0] : index
-        %1 = hal.interface.constant.load[1] : index
-        %2 = hal.interface.constant.load[2] : index
-        %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+        %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+        %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
            : !flow.dispatch.tensor>{%0, %2}
-        %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+        %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
            : !flow.dispatch.tensor>{%2, %1}
-        %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+        %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2)
            : !flow.dispatch.tensor>{%0, %1}
-        %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+        %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3)
            : !flow.dispatch.tensor>{%0, %1}
        %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1]
            : !flow.dispatch.tensor>{%0, %2} -> tensor
@@ -2475,7 +2539,14 @@ hal.executable private @matmul_tensors {
 #config = #iree_codegen.lowering_config
 #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
 #map = affine_map<()[s0] -> (s0 ceildiv 64)>
-#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer>, <2, storage_buffer>, <3, storage_buffer>]>]>
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 #translation = #iree_codegen.translation_info
 module {
   hal.executable private @matmul_tensors {
@@ -2489,13 +2560,13 @@ module {
      }
      builtin.module {
        func.func @matmul_already_distributed() attributes {translation_info = #translation} {
-          %0 = hal.interface.constant.load[0] : index
-          %1 = hal.interface.constant.load[1] : index
-          %2 = hal.interface.constant.load[2] : index
-          %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2}
-          %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1}
-          %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
-          %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1}
+          %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+          %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+          %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+          %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2}
+          %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1}
+          %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1}
+          %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1}
          %workgroup_id_x = hal.interface.workgroup.id[0] : index
          %workgroup_count_x = hal.interface.workgroup.count[0] : index
          %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -2513,9 +2584,9 @@ module {
 }
 // CHECK-LABEL: func.func @matmul_already_distributed
-// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1)
-// CHECK: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(3)
+// CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK: %[[OUT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3)
 // CHECK-NOT: scf.for
 // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]], offsets = [%workgroup_id_y, 0]
 // CHECK: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]], offsets = [0, %workgroup_id_x]
@@ -2526,9 +2597,15 @@ module {
 // Check that the distribution avoids distributing unit-trip count loops.
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @avoid_unit_range_distribute {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @avoid_unit_range_distribute ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], subgroup_size = 32 : index, translation_info = #iree_codegen.translation_info, workgroup_size = [32 : index, 1 : index, 1 : index]} {
+    hal.executable.export public @avoid_unit_range_distribute ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 32 : index, translation_info = #iree_codegen.translation_info, workgroup_size = [32 : index, 1 : index, 1 : index]} {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2
      hal.return %x, %y, %z : index, index, index
@@ -2538,12 +2615,12 @@ hal.executable private @avoid_unit_range_distribute {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant 0.000000e+00 : f16
-        %0 = hal.interface.constant.load[0] : i32
-        %1 = hal.interface.constant.load[1] : i32
-        %2 = hal.interface.constant.load[2] : i32
-        %3 = hal.interface.constant.load[3] : i32
-        %4 = hal.interface.constant.load[4] : i32
-        %5 = hal.interface.constant.load[5] : i32
+        %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+        %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+        %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+        %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+        %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+        %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
        %6 = arith.extui %0 : i32 to i64
        %7 = arith.extui %1 : i32 to i64
        %8 = arith.shli %7, %c32_i64 : i64
@@ -2561,9 +2638,9 @@ hal.executable private @avoid_unit_range_distribute {
        %20 = arith.index_castui %19 : i64 to index
        %21 = flow.dispatch.workload.ordinal %15, 0 : index
        %22 = flow.dispatch.workload.ordinal %20, 1 : index
-        %23 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%21, %22}
-        %24 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor>{%22}
-        %25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%22}
+        %23 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%21, %22}
+        %24 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor>{%22}
+        %25 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%22}
        %26 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0, 0, 0], sizes = [32, %21, %22, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor>{%21, %22} -> tensor<32x?x?x16x16xf16>
        %27 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0, 0, 0], sizes = [32, %22, 8, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor>{%22} -> tensor<32x?x8x16x16xf16>
        %28 = tensor.empty(%22) : tensor<32x?x16x8x16xf16>
@@ -2597,9 +2674,16 @@ hal.executable private @avoid_unit_range_distribute {
 // Check that the distribution avoids distributing unit-trip count loops.
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 hal.executable private @set_size_to_tilesize_when_divisible {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @set_size_to_tilesize_when_divisible ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
+    hal.executable.export public @set_size_to_tilesize_when_divisible ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg1, %arg2, %arg3
      hal.return %x, %y, %z : index, index, index
@@ -2609,12 +2693,12 @@ hal.executable private @set_size_to_tilesize_when_divisible {
        %c0 = arith.constant 0 : index
        %c32_i64 = arith.constant 32 : i64
        %cst = arith.constant 0.000000e+00 : f16
-        %0 = hal.interface.constant.load[0] : i32
-        %1 = hal.interface.constant.load[1] : i32
-        %2 = hal.interface.constant.load[2] : i32
-        %3 = hal.interface.constant.load[3] : i32
-        %4 = hal.interface.constant.load[4] : i32
-        %5 = hal.interface.constant.load[5] : i32
+        %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+        %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+        %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+        %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+        %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+        %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
        %6 = arith.extui %0 : i32 to i64
        %7 = arith.extui %1 : i32 to i64
        %8 = arith.shli %7, %c32_i64 : i64
@@ -2631,10 +2715,10 @@ hal.executable private @set_size_to_tilesize_when_divisible {
        %19 = arith.ori %16, %18 : i64
        %20 = arith.index_castui %19 : i64 to index
        %21 = flow.dispatch.workload.ordinal %20, 1 : index
-        %22 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+        %22 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
        %23 = flow.dispatch.workload.ordinal %21, 2 : index
-        %24 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%21}
-        %25 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%10) : !flow.dispatch.tensor>{%23}
+        %24 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%21}
+        %25 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%10) : !flow.dispatch.tensor>{%23}
        %26 = flow.dispatch.workload.ordinal %15, 0 : index
        %27 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0, 0], sizes = [%21, 16, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%21} -> tensor
        %28 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xf16>
@@ -2686,11 +2770,11 @@ hal.executable private @reshape_matmul_tensors {
    builtin.module {
      func.func @reshape_matmul() attributes {translation_info = #translation} {
        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
            : !flow.dispatch.tensor>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1)
            : !flow.dispatch.tensor>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2)
            : !flow.dispatch.tensor>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 2, 256], strides = [1, 1, 1]
            : !flow.dispatch.tensor> -> tensor<64x2x256xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir b/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir
index b6edcb72ef4f9..982c6ab3f8dcc 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/type_propagation.mlir
@@ -1,9 +1,15 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-type-propagation))" --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @generic_op_illegal_operand() {
-  %d = hal.interface.constant.load[0] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d}
   %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %3 = arith.trunci %2 : tensor to tensor
   %4 = tensor.empty(%d) : tensor
@@ -19,8 +25,8 @@ func.func @generic_op_illegal_operand() {
   return
 }
 // CHECK-LABEL: func.func @generic_op_illegal_operand()
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[INTENSOR:.+]] = flow.dispatch.tensor.load %[[IN]]
 // CHECK-DAG: %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor
 // CHECK: %[[GENERIC:.+]] = linalg.generic
@@ -34,10 +40,16 @@ func.func @generic_op_illegal_operand()
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @generic_op_illegal_operand_i7() {
-  %d = hal.interface.constant.load[0] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d}
   %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %3 = arith.trunci %2 : tensor to tensor
   %4 = tensor.empty(%d) : tensor
@@ -53,8 +65,8 @@ func.func @generic_op_illegal_operand_i7() {
   return
 }
 // CHECK-LABEL: func.func @generic_op_illegal_operand_i7()
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[INTENSOR:.+]] = flow.dispatch.tensor.load %[[IN]]
 // CHECK-DAG: %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor
 // CHECK: %[[GENERIC:.+]] = linalg.generic
@@ -68,10 +80,16 @@ func.func @generic_op_illegal_operand_i7()
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @generic_op_illegal_operand_i33() {
-  %d = hal.interface.constant.load[0] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d}
   %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %3 = arith.trunci %2 : tensor to tensor
   %4 = tensor.empty(%d) : tensor
@@ -87,8 +105,8 @@ func.func @generic_op_illegal_operand_i33() {
   return
 }
 // CHECK-LABEL: func.func @generic_op_illegal_operand_i33()
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[INTENSOR:.+]] = flow.dispatch.tensor.load %[[IN]]
 // CHECK-DAG: %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor
 // CHECK: %[[GENERIC:.+]] = linalg.generic
@@ -100,13 +118,18 @@ func.func @generic_op_illegal_operand_i33() {
 // CHECK: linalg.yield %[[EXTUI]]
 // CHECK: flow.dispatch.tensor.store %[[GENERIC]], %[[OUT]]
-
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @generic_op_illegal_result() {
-  %d = hal.interface.constant.load[0] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d}
   %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %3 = tensor.empty(%d) : tensor
   %4 = linalg.generic {
@@ -122,8 +145,8 @@ func.func @generic_op_illegal_result() {
   return
 }
 // CHECK-LABEL: func.func @generic_op_illegal_result()
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[INTENSOR:.+]] = flow.dispatch.tensor.load %[[IN]]
 // CHECK-DAG: %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor
 // CHECK: %[[GENERIC:.+]] = linalg.generic
@@ -137,12 +160,18 @@ func.func @generic_op_illegal_result()
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @tensor_extract() {
-  %d = hal.interface.constant.load[0] : index
-  %offset = hal.interface.constant.load[1] : index
-  %size = hal.interface.constant.load[2] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %size = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d}
   %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %3 = tensor.extract_slice %2[%offset] [%size] [1] : tensor to tensor
   %4 = arith.trunci %3 : tensor to tensor
@@ -151,21 +180,28 @@ func.func @tensor_extract() {
   return
 }
 // CHECK-LABEL: func.func @tensor_extract()
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[INTENSOR:.+]] = flow.dispatch.tensor.load %[[IN]]
 // CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[INTENSOR]]
 // CHECK: flow.dispatch.tensor.store %[[EXTRACT]], %[[OUT]]
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @tensor_insert() {
-  %d = hal.interface.constant.load[0] : index
-  %offset = hal.interface.constant.load[1] : index
-  %size = hal.interface.constant.load[2] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %size = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%d}
   %3 = flow.dispatch.tensor.load %0, offsets = [%offset], sizes=[%size], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %5 = arith.trunci %3 : tensor to tensor
@@ -176,9 +212,9 @@ func.func @tensor_insert() {
   return
 }
 // CHECK-LABEL: func.func @tensor_insert()
-// CHECK-DAG: %[[IN1:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[IN2:.+]] = hal.interface.binding.subspan set(0) binding(1)
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(2)
+// CHECK-DAG: %[[IN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[IN2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
 // CHECK-DAG: %[[IN1TENSOR:.+]] = flow.dispatch.tensor.load %[[IN1]]
 // CHECK-DAG: %[[IN2TENSOR:.+]] = flow.dispatch.tensor.load %[[IN2]]
 // CHECK: %[[INSERT:.+]] = tensor.insert_slice %[[IN1TENSOR]] into %[[IN2TENSOR]]
@@ -186,12 +222,18 @@ func.func @tensor_insert()
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @for_loop() {
-  %d = hal.interface.constant.load[0] : index
-  %lb = hal.interface.constant.load[1] : index
-  %step = hal.interface.constant.load[2] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %lb = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %step = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d}
   %2 = flow.dispatch.tensor.load %0, offsets=[0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %3 = flow.dispatch.tensor.load %1, offsets=[0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor
   %4 = arith.trunci %2 : tensor to tensor
@@ -207,8 +249,8 @@ func.func @for_loop() {
   return
 }
 // CHECK-LABEL: func.func @for_loop()
-// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-DAG: %[[INTENSOR:.+]] = flow.dispatch.tensor.load %[[IN]]
 // CHECK-DAG: %[[OUTTENSOR:.+]] = flow.dispatch.tensor.load %[[OUT]]
 // CHECK: %[[FOR:.+]] = scf.for
@@ -220,9 +262,14 @@ func.func @for_loop()
 // -----
+#pipeline_layout = #hal.pipeline.layout
+  ]>
+]>
 func.func @fill_op() {
-  %d = hal.interface.constant.load[0] : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d}
+  %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout)
set(0) binding(0) : !flow.dispatch.tensor>{%d} %1 = tensor.empty(%d) : tensor %false = arith.constant false %2 = linalg.fill ins(%false : i1) outs(%1 : tensor) -> tensor @@ -231,7 +278,7 @@ func.func @fill_op() { return } // CHECK-LABEL: func.func @fill_op() -// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-DAG: %[[INIT:.+]] = tensor.empty // CHECK-DAG: %[[FALSE:.+]] = arith.constant false // CHECK-DAG: %[[EXT_SCALAR:.+]] = arith.extui %[[FALSE]] @@ -242,11 +289,16 @@ func.func @fill_op() { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> #map = affine_map<(d0) -> (d0)> func.func @constant_op() { - %a = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %b = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %c = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> + %a = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %b = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %c = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> %at = flow.dispatch.tensor.load %a, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor> -> tensor<4xi32> %bt = flow.dispatch.tensor.load %b, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor> -> tensor<4xi32> %select = arith.constant dense<[true, false, true, false]> : tensor<4xi1> @@ -274,11 +326,16 @@ func.func @constant_op() { // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> #map = affine_map<(d0) -> (d0)> func.func @constant_splat_op() { - %a = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %b = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %c = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> + %a = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %b = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %c = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> %at = flow.dispatch.tensor.load %a, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor> -> tensor<4xi32> %bt = flow.dispatch.tensor.load %b, offsets = [0], sizes = [4], strides = [1] : !flow.dispatch.tensor> -> tensor<4xi32> %select = arith.constant dense : tensor<4xi1> @@ -300,12 +357,18 @@ func.func @constant_splat_op() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @tensor_extract() { %c0 = arith.constant 0 : index %c13 = arith.constant 13 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [14], strides = [1] : 
!flow.dispatch.tensor> -> tensor<14xi8> @@ -326,7 +389,7 @@ func.func @tensor_extract() { return } // CHECK-LABEL: func @tensor_extract() -// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor> // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[BINDING]] // CHECK: %[[EXTRACTED:.+]] = tensor.extract %[[LOAD]] @@ -362,11 +425,18 @@ func.func @named_op(%arg0 : tensor, %arg1 : tensor) -> tensor, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @scatter() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor> -> tensor<8xi8> %4 = arith.trunci %3 : tensor<8xi8> to tensor<8xi1> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x1xi32> @@ -383,9 +453,9 @@ func.func @scatter() { } // CHECK-LABEL: func.func @scatter() -// CHECK-DAG: %[[UPDATES:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[INDICES:.+]] = hal.interface.binding.subspan set(0) binding(1) -// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK-DAG: %[[UPDATES:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[INDICES:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-DAG: %[[UPDATES_TENSOR:.+]] = flow.dispatch.tensor.load %[[UPDATES]] // CHECK-DAG: %[[INDICES_TENSOR:.+]] = flow.dispatch.tensor.load %[[INDICES]] // CHECK-DAG: %[[OUT_TENSOR:.+]] = flow.dispatch.tensor.load %[[OUT]] @@ -402,10 +472,16 @@ func.func @scatter() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @sort() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi8> %3 
= arith.trunci %2 : tensor<1xi8> to tensor<1xi1> %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi32> @@ -420,8 +496,8 @@ func.func @sort() { // CHECK-LABEL: func.func @sort() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[A:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[B:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[A_TENSOR:.+]] = flow.dispatch.tensor.load %[[A]] // CHECK-DAG: %[[B_TENSOR:.+]] = flow.dispatch.tensor.load %[[B]] // CHECK: %[[SORT:.+]]:2 = iree_linalg_ext.sort dimension(0) @@ -433,13 +509,18 @@ func.func @sort() { // CHECK: iree_linalg_ext.yield %[[CMPI]] // CHECK: flow.dispatch.tensor.store %[[SORT]]#1, %[[B]] - // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @sort_secondary() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi32> %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi8> %4 = arith.trunci %3 : tensor<1xi8> to tensor<1xi1> @@ -455,8 +536,8 @@ func.func @sort_secondary() { // CHECK-LABEL: func.func @sort_secondary() // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[A:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[B:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[A_TENSOR:.+]] = flow.dispatch.tensor.load %[[A]] // CHECK-DAG: %[[B_TENSOR:.+]] = flow.dispatch.tensor.load %[[B]] // CHECK: %[[SORT:.+]]:2 = iree_linalg_ext.sort dimension(0) @@ -468,11 +549,17 @@ func.func @sort_secondary() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @branch_op() { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.constant.load[0] : i8 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %3 = 
hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i8 %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor %5 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor %6 = arith.trunci %3 : i8 to i1 diff --git a/compiler/src/iree/compiler/Codegen/Common/test/type_propagation_packing.mlir b/compiler/src/iree/compiler/Codegen/Common/test/type_propagation_packing.mlir index 2ef0bc945bb45..367a1fcad4954 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/type_propagation_packing.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/type_propagation_packing.mlir @@ -1,9 +1,15 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-type-propagation))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @generic_op_i4() { - %d = hal.interface.constant.load[0] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%d} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%d} + %d = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%d} + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%d} %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes=[%d], strides=[1] : !flow.dispatch.tensor>{%d} -> tensor %4 = tensor.empty(%d) : tensor %5 = linalg.generic { @@ -19,8 +25,8 @@ func.func @generic_op_i4() { } // CHECK-LABEL: func.func @generic_op_i4() -// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[IN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[INTENSOR:.+]] = flow.dispatch.tensor.load %[[IN]]{{.+}} -> tensor // CHECK-DAG: %[[INIT:.+]] = tensor.empty(%{{.+}}) : tensor // CHECK: %[[GENERIC:.+]] = linalg.generic diff --git a/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir index 5f64013aa7829..1654fce2b5d58 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/workgroup_specialization.mlir @@ -1,14 +1,21 @@ // RUN: iree-opt --iree-codegen-enable-workgroup-specialization --pass-pipeline="builtin.module(func.func(iree-codegen-workgroup-specialization),canonicalize,cse)" --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * -64 + 123, 64)> #map2 = affine_map<()[s0] -> (s0 * -64 + 789, 64)> func.func @matmul_tensors() { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) 
binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %3 = affine.apply #map()[%workgroup_id_y] @@ -24,7 +31,6 @@ func.func @matmul_tensors() { return } - // CHECK: func.func @matmul_tensors() // CHECK: %[[C64:.+]] = arith.constant 64 : index // CHECK: %[[CMP0:.+]] = arith.cmpi eq, %{{.+}}, %[[C64]] : index @@ -39,6 +45,13 @@ func.func @matmul_tensors() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * -64 + 123, 64)> @@ -46,9 +59,9 @@ func.func @matmul_tensors() { #map3 = affine_map<(d0, d1) -> (d0, d1)> func.func @add_tensors() { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %3 = affine.apply #map()[%workgroup_id_y] @@ -82,6 +95,13 @@ func.func @add_tensors() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 2)> #map1 = affine_map<()[s0] -> (s0 * 256)> @@ -94,10 +114,10 @@ func.func @unaligned_partial_loop() { %c265458176 = arith.constant 265458176 : index %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c512) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c786944) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c265458176) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c786944) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c265458176) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %4 = affine.apply #map()[%workgroup_id_y] diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/fuse_and_hoist_forall.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/fuse_and_hoist_forall.mlir index 2ee4d2c00a28d..e889d821271b8 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/fuse_and_hoist_forall.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/fuse_and_hoist_forall.mlir @@ -1,64 +1,69 @@ // RUN: iree-opt %s --pass-pipeline='builtin.module(func.func(iree-gpu-fuse-and-hoist-parallel-loops))' --split-input-file | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0) -> (d0 * 2)> #map1 = affine_map<(d0) -> (d0 * 4)> #map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)> #map3 = affine_map<(d0)[s0] -> (d0 * 2 + s0)> #map4 = affine_map<(d0) -> (d0 * 16)> -module { - func.func @forall_fuse_then_hoist() { - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> - %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf32> - %6 = tensor.empty() : tensor<128x4xf16> - %7 = tensor.empty() : tensor<4x128xf16> - %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) { - %9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf16>) { - %12 = affine.apply #map(%arg2) - %13 = affine.apply #map1(%arg3) - %14 = affine.apply #map(%arg2) - %15 = affine.apply #map2(%arg3)[%arg0] - %extracted_slice = tensor.extract_slice %3[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> - %extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<128x4xf16> to tensor<2x4xf16> - %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> - scf.forall.in_parallel { - tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<128x4xf16> - } - } {mapping = [#gpu.thread, #gpu.thread]} - %10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf16>) { - %12 = affine.apply #map(%arg2) - %13 = affine.apply #map1(%arg3) - %14 = affine.apply #map3(%arg2)[%arg0] - %15 = affine.apply #map1(%arg3) - %extracted_slice = tensor.extract_slice %4[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> - %extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<4x128xf16> to tensor<2x4xf16> - %16 = linalg.copy 
ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> - scf.forall.in_parallel { - tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<4x128xf16> - } - } {mapping = [#gpu.thread, #gpu.thread]} - %11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) { - %12 = affine.apply #map4(%arg2) - %13 = affine.apply #map4(%arg3) - %extracted_slice = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf16> to tensor<16x4xf16> - %extracted_slice_0 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf16> to tensor<4x16xf16> - %extracted_slice_1 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32> - %14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<16x4xf16>, tensor<4x16xf16>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32> - } - } {mapping = [#gpu.thread, #gpu.thread]} - scf.yield %11 : tensor<128x128xf32> - } - flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor> - return +func.func @forall_fuse_then_hoist() { + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf32> + %6 = tensor.empty() : tensor<128x4xf16> + %7 = tensor.empty() : tensor<4x128xf16> + %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) { + %9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf16>) { + %12 = affine.apply #map(%arg2) + %13 = affine.apply #map1(%arg3) + %14 = affine.apply #map(%arg2) + %15 = affine.apply #map2(%arg3)[%arg0] + %extracted_slice = tensor.extract_slice %3[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> + %extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<128x4xf16> to tensor<2x4xf16> + %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> + scf.forall.in_parallel { + tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<128x4xf16> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf16>) { + %12 = affine.apply #map(%arg2) + %13 = affine.apply #map1(%arg3) + %14 = affine.apply #map3(%arg2)[%arg0] + %15 = affine.apply #map1(%arg3) + %extracted_slice = 
tensor.extract_slice %4[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> + %extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<4x128xf16> to tensor<2x4xf16> + %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> + scf.forall.in_parallel { + tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<4x128xf16> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) { + %12 = affine.apply #map4(%arg2) + %13 = affine.apply #map4(%arg3) + %extracted_slice = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf16> to tensor<16x4xf16> + %extracted_slice_0 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf16> to tensor<4x16xf16> + %extracted_slice_1 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32> + %14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<16x4xf16>, tensor<4x16xf16>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + scf.yield %11 : tensor<128x128xf32> } + flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @forall_fuse_then_hoist @@ -71,51 +76,56 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0) -> (d0 * 2)> #map1 = affine_map<(d0) -> (d0 * 4)> #map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)> #map3 = affine_map<(d0) -> (d0 * 16)> -module { - func.func @forall_fuse_then_hoist_mixed_mappings() { - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.0> : tensor<4x128xf16> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> - %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf32> - %6 = tensor.empty() : tensor<128x4xf16> - %7 = tensor.empty() : tensor<4x128xf16> - %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) { - %9 = scf.forall (%arg2, %arg3, %arg4) in (1, 64, 1) shared_outs(%arg5 = %6) -> (tensor<128x4xf16>) { - %12 = affine.apply #map(%arg3) - %13 = affine.apply #map1(%arg4) - %14 = affine.apply #map(%arg3) - %15 = affine.apply #map2(%arg4)[%arg0] - %extracted_slice = tensor.extract_slice %3[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> - %extracted_slice_0 = tensor.extract_slice %arg5[%12, %13] [2, 4] [1, 1] : tensor<128x4xf16> to tensor<2x4xf16> - %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> - scf.forall.in_parallel { - 
tensor.parallel_insert_slice %16 into %arg5[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<128x4xf16> - } - } {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} - %11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) { - %12 = affine.apply #map3(%arg2) - %13 = affine.apply #map3(%arg3) - %extracted_slice = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf16> to tensor<16x4xf16> - %extracted_slice_0 = tensor.extract_slice %cst[0, %13] [4, 16] [1, 1] : tensor<4x128xf16> to tensor<4x16xf16> - %extracted_slice_1 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32> - %14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<16x4xf16>, tensor<4x16xf16>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32> - } - } {mapping = [#gpu.thread, #gpu.thread]} - scf.yield %11 : tensor<128x128xf32> - } - flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor> - return +func.func @forall_fuse_then_hoist_mixed_mappings() { + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0> : tensor<4x128xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf32> + %6 = tensor.empty() : tensor<128x4xf16> + %7 = tensor.empty() : tensor<4x128xf16> + %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) { + %9 = scf.forall (%arg2, %arg3, %arg4) in (1, 64, 1) shared_outs(%arg5 = %6) -> (tensor<128x4xf16>) { + %12 = affine.apply #map(%arg3) + %13 = affine.apply #map1(%arg4) + %14 = affine.apply #map(%arg3) + %15 = affine.apply #map2(%arg4)[%arg0] + %extracted_slice = tensor.extract_slice %3[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> + %extracted_slice_0 = tensor.extract_slice %arg5[%12, %13] [2, 4] [1, 1] : tensor<128x4xf16> to tensor<2x4xf16> + %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> + scf.forall.in_parallel { + tensor.parallel_insert_slice %16 into %arg5[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<128x4xf16> + } + } {mapping = [#gpu.thread, #gpu.thread, #gpu.thread]} + %11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) { + %12 = affine.apply #map3(%arg2) + %13 = affine.apply #map3(%arg3) + %extracted_slice = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf16> to tensor<16x4xf16> + %extracted_slice_0 = tensor.extract_slice %cst[0, %13] [4, 16] [1, 1] : tensor<4x128xf16> to tensor<4x16xf16> + %extracted_slice_1 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32> + %14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : 
tensor<16x4xf16>, tensor<4x16xf16>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + scf.yield %11 : tensor<128x128xf32> } + flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @forall_fuse_then_hoist_mixed_mappings @@ -129,67 +139,72 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0) -> (d0 * 2)> #map1 = affine_map<(d0) -> (d0 * 4)> #map2 = affine_map<(d0)[s0] -> (d0 * 4 + s0)> #map3 = affine_map<(d0)[s0] -> (d0 * 2 + s0)> #map4 = affine_map<(d0) -> (d0 * 16)> -module { - func.func @forall_fuse_then_hoist_with_fill() { - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> - %empty = tensor.empty() : tensor<128x128xf32> - %cst = arith.constant 0.0 : f32 - %5 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128x128xf32>) -> tensor<128x128xf32> - %6 = tensor.empty() : tensor<128x4xf16> - %7 = tensor.empty() : tensor<4x128xf16> - %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) { - %9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf16>) { - %12 = affine.apply #map(%arg2) - %13 = affine.apply #map1(%arg3) - %14 = affine.apply #map(%arg2) - %15 = affine.apply #map2(%arg3)[%arg0] - %extracted_slice = tensor.extract_slice %3[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> - %extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<128x4xf16> to tensor<2x4xf16> - %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> - scf.forall.in_parallel { - tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<128x4xf16> - } - } {mapping = [#gpu.thread, #gpu.thread]} - %10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf16>) { - %12 = affine.apply #map(%arg2) - %13 = affine.apply #map1(%arg3) - %14 = affine.apply #map3(%arg2)[%arg0] - %15 = affine.apply #map1(%arg3) - %extracted_slice = tensor.extract_slice %4[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> - %extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<4x128xf16> to tensor<2x4xf16> - %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> - scf.forall.in_parallel { - 
tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<4x128xf16> - } - } {mapping = [#gpu.thread, #gpu.thread]} - %11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) { - %12 = affine.apply #map4(%arg2) - %13 = affine.apply #map4(%arg3) - %extracted_slice = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf16> to tensor<16x4xf16> - %extracted_slice_0 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf16> to tensor<4x16xf16> - %extracted_slice_1 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32> - %14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<16x4xf16>, tensor<4x16xf16>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32> - } - } {mapping = [#gpu.thread, #gpu.thread]} - scf.yield %11 : tensor<128x128xf32> - } - flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor> - return +func.func @forall_fuse_then_hoist_with_fill() { + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> + %empty = tensor.empty() : tensor<128x128xf32> + %cst = arith.constant 0.0 : f32 + %5 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128x128xf32>) -> tensor<128x128xf32> + %6 = tensor.empty() : tensor<128x4xf16> + %7 = tensor.empty() : tensor<4x128xf16> + %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %5) -> (tensor<128x128xf32>) { + %9 = scf.forall (%arg2, %arg3) in (64, 1) shared_outs(%arg4 = %6) -> (tensor<128x4xf16>) { + %12 = affine.apply #map(%arg2) + %13 = affine.apply #map1(%arg3) + %14 = affine.apply #map(%arg2) + %15 = affine.apply #map2(%arg3)[%arg0] + %extracted_slice = tensor.extract_slice %3[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> + %extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<128x4xf16> to tensor<2x4xf16> + %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> + scf.forall.in_parallel { + tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<128x4xf16> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %10 = scf.forall (%arg2, %arg3) in (2, 32) shared_outs(%arg4 = %7) -> (tensor<4x128xf16>) { + %12 = affine.apply #map(%arg2) + %13 = affine.apply #map1(%arg3) + %14 = affine.apply #map3(%arg2)[%arg0] + %15 = affine.apply #map1(%arg3) + %extracted_slice = tensor.extract_slice %4[%14, %15] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> + 
%extracted_slice_0 = tensor.extract_slice %arg4[%12, %13] [2, 4] [1, 1] : tensor<4x128xf16> to tensor<2x4xf16> + %16 = linalg.copy ins(%extracted_slice : tensor<2x4xf16>) outs(%extracted_slice_0 : tensor<2x4xf16>) -> tensor<2x4xf16> + scf.forall.in_parallel { + tensor.parallel_insert_slice %16 into %arg4[%12, %13] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<4x128xf16> + } + } {mapping = [#gpu.thread, #gpu.thread]} + %11 = scf.forall (%arg2, %arg3) in (8, 8) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf32>) { + %12 = affine.apply #map4(%arg2) + %13 = affine.apply #map4(%arg3) + %extracted_slice = tensor.extract_slice %9[%12, 0] [16, 4] [1, 1] : tensor<128x4xf16> to tensor<16x4xf16> + %extracted_slice_0 = tensor.extract_slice %10[0, %13] [4, 16] [1, 1] : tensor<4x128xf16> to tensor<4x16xf16> + %extracted_slice_1 = tensor.extract_slice %arg4[%12, %13] [16, 16] [1, 1] : tensor<128x128xf32> to tensor<16x16xf32> + %14 = linalg.matmul ins(%extracted_slice, %extracted_slice_0 : tensor<16x4xf16>, tensor<4x16xf16>) outs(%extracted_slice_1 : tensor<16x16xf32>) -> tensor<16x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %14 into %arg4[%12, %13] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<128x128xf32> + } + } {mapping = [#gpu.thread, #gpu.thread]} + scf.yield %11 : tensor<128x128xf32> } + flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @forall_fuse_then_hoist_with_fill @@ -203,37 +218,42 @@ module { // ----- -module { - func.func @multi_hoist_and_fuse_trailing_stuff() { - %c4 = arith.constant 4 : index - %c128 = arith.constant 128 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> - %empty = tensor.empty() : tensor<128x128xf16> - %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %empty) -> (tensor<128x128xf16>) { - %9 = scf.forall (%arg2, %arg3) in (2, 2) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf16>) { - %extracted_slice = tensor.extract_slice %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<128x128xf16> to tensor<64x64xf16> - %10 = scf.forall (%arg5, %arg6) in (32, 16) shared_outs(%arg7 = %extracted_slice) -> (tensor<64x64xf16>) { - %extracted_slice_1 = tensor.extract_slice %2[%arg5, %arg6] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> - %extracted_slice_2 = tensor.extract_slice %arg7[%arg5, %arg6] [2, 4] [1, 1] : tensor<64x64xf16> to tensor<2x4xf16> - %16 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf16>) outs(%extracted_slice_2 : tensor<2x4xf16>) -> tensor<2x4xf16> - scf.forall.in_parallel { - tensor.parallel_insert_slice %16 into %arg7[%arg5, %arg6] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<64x64xf16> - } - } {mapping = [#gpu.thread, #gpu.thread]} +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @multi_hoist_and_fuse_trailing_stuff() { + %c4 = arith.constant 4 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) 
binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x128xf16> + %empty = tensor.empty() : tensor<128x128xf16> + %8 = scf.for %arg0 = %c0 to %c128 step %c4 iter_args(%arg1 = %empty) -> (tensor<128x128xf16>) { + %9 = scf.forall (%arg2, %arg3) in (2, 2) shared_outs(%arg4 = %arg1) -> (tensor<128x128xf16>) { + %extracted_slice = tensor.extract_slice %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<128x128xf16> to tensor<64x64xf16> + %10 = scf.forall (%arg5, %arg6) in (32, 16) shared_outs(%arg7 = %extracted_slice) -> (tensor<64x64xf16>) { + %extracted_slice_1 = tensor.extract_slice %2[%arg5, %arg6] [2, 4] [1, 1] : tensor<128x128xf16> to tensor<2x4xf16> + %extracted_slice_2 = tensor.extract_slice %arg7[%arg5, %arg6] [2, 4] [1, 1] : tensor<64x64xf16> to tensor<2x4xf16> + %16 = linalg.copy ins(%extracted_slice_1 : tensor<2x4xf16>) outs(%extracted_slice_2 : tensor<2x4xf16>) -> tensor<2x4xf16> scf.forall.in_parallel { - tensor.parallel_insert_slice %10 into %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<64x64xf16> into tensor<128x128xf16> + tensor.parallel_insert_slice %16 into %arg7[%arg5, %arg6] [2, 4] [1, 1] : tensor<2x4xf16> into tensor<64x64xf16> } - } {mapping = [#gpu.warp, #gpu.warp]} - scf.yield %9 : tensor<128x128xf16> - } - %transpose = linalg.transpose ins(%8: tensor<128x128xf16>) outs(%empty: tensor<128x128xf16>) permutation = [1, 0] - %ceil = linalg.ceil ins(%transpose: tensor<128x128xf16>) outs(%empty: tensor<128x128xf16>) -> tensor<128x128xf16> - flow.dispatch.tensor.store %ceil, %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf16> -> !flow.dispatch.tensor> - return + } {mapping = [#gpu.thread, #gpu.thread]} + scf.forall.in_parallel { + tensor.parallel_insert_slice %10 into %arg4[%arg2, %arg3] [64, 64] [1, 1] : tensor<64x64xf16> into tensor<128x128xf16> + } + } {mapping = [#gpu.warp, #gpu.warp]} + scf.yield %9 : tensor<128x128xf16> } + %transpose = linalg.transpose ins(%8: tensor<128x128xf16>) outs(%empty: tensor<128x128xf16>) permutation = [1, 0] + %ceil = linalg.ceil ins(%transpose: tensor<128x128xf16>) outs(%empty: tensor<128x128xf16>) -> tensor<128x128xf16> + flow.dispatch.tensor.store %ceil, %1, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xf16> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @multi_hoist_and_fuse_trailing_stuff diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir index 8be21d58f099b..327ef623bd015 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_dotprod_vector_lowering.mlir @@ -5,6 +5,12 @@ data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android29"}> +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @mmt4d_kernel_dispatch() attributes {hal.executable.target = #target} { %c0_i8 = arith.constant 0 : i8 %cst = arith.constant dense<0> : vector<1x1x8x8xi32> @@ -13,11 +19,11 @@ func.func @mmt4d_kernel_dispatch() attributes 
{hal.executable.target = #target} %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c64 = arith.constant 64 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x2x8x4xi8> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<1x2x8x4xi8> memref.assume_alignment %0, 64 : memref<1x2x8x4xi8> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c64) : memref<1x2x8x4xi8> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c64) : memref<1x2x8x4xi8> memref.assume_alignment %1, 64 : memref<1x2x8x4xi8> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c128) : memref<1x1x8x8xi32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c128) : memref<1x1x8x8xi32> memref.assume_alignment %2, 64 : memref<1x1x8x8xi32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir index 110d626d31fa1..814138ac52e33 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/aarch64_vector_lowering.mlir @@ -2,6 +2,13 @@ // RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering{vector-contract-custom-kernels=false}))" --split-input-file | FileCheck %s -check-prefix=CHECK-KERNEL-OFF // RUN: iree-opt %s --pass-pipeline="builtin.module(func.func(iree-llvmcpu-mmt4d-vector-lowering{vector-contract-custom-kernels=true}))" --split-input-file | FileCheck %s -check-prefix=CHECK-KERNEL-ON +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map0 = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> @@ -17,9 +24,9 @@ module { %cst_0 = arith.constant 0.000000e+00 : f32 %c384 = arith.constant 384 : index %c128 = arith.constant 128 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -64,9 +71,9 @@ module { // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index // CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> -// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan set(0) 
binding(1) type(storage_buffer) : !flow.dispatch.tensor>
-// CHECK: %[[DST:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>
+// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : !flow.dispatch.tensor>
+// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : !flow.dispatch.tensor>
+// CHECK: %[[DST:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : !flow.dispatch.tensor>
 // CHECK: %[[DST_TILE_INIT:.+]] = tensor.empty()
 // CHECK: scf.for %[[I_IDX:.+]] = {{.*}} to %[[C384]] step %{{[0-9]*}} {
 // CHECK: %[[LHS_TILE:.+]] = flow.dispatch.tensor.load %[[LHS]], {{.*}} -> tensor<64x512xf32>
@@ -88,6 +95,16 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>,
+ #hal.descriptor_set.binding<3, storage_buffer>,
+ #hal.descriptor_set.binding<4, storage_buffer>,
+ #hal.descriptor_set.binding<5, storage_buffer>
+ ]>
+]>
 #map0 = affine_map<()[s0] -> (s0 * 64)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d2, d1)>
@@ -108,12 +125,12 @@ module {
 %c1835008 = arith.constant 1835008 : index
 %c0 = arith.constant 0 : index
 %c64 = arith.constant 64 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>
- %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) offset(%c1835008) : !flow.dispatch.tensor>
- %5 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) : !flow.dispatch.tensor>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>
+ %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>
+ %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) offset(%c1835008) : !flow.dispatch.tensor>
+ %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(5) : !flow.dispatch.tensor>
 %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [2, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x512xf32>
 %workgroup_id_x = hal.interface.workgroup.id[0] : index
 %workgroup_count_x = hal.interface.workgroup.count[0] : index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/apply_scale_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/apply_scale_lowering.mlir
index 8077651dce476..20c59336353b8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/apply_scale_lowering.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/apply_scale_lowering.mlir
@@ -27,8 +27,8 @@ hal.executable private @apply_scale_no_vector_feature {
 %cst = arith.constant dense<19689> : vector<2xi32>
 %cst_0 = arith.constant dense<15> : vector<2xi8>
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<2xi32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2xi32>
 %2 = vector.load %0[%c0] : memref<2xi32>, vector<2xi32>
 %3 = tosa.apply_scale %2, %cst, %cst_0 {double_round = false} : (vector<2xi32>, vector<2xi32>, vector<2xi8>) -> vector<2xi32>
 vector.store %3, %1[%c0] : memref<2xi32>, vector<2xi32>
@@ -75,8 +75,8 @@ hal.executable private @apply_scale_v {
 %cst = arith.constant dense<19689> : vector<2xi32>
 %cst_0 = arith.constant dense<15> : vector<2xi8>
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<2xi32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2xi32>
 %2 = vector.load %0[%c0] : memref<2xi32>, vector<2xi32>
 %3 = tosa.apply_scale %2, %cst, %cst_0 {double_round = false} : (vector<2xi32>, vector<2xi32>, vector<2xi8>) -> vector<2xi32>
 vector.store %3, %1[%c0] : memref<2xi32>, vector<2xi32>
@@ -121,8 +121,8 @@ hal.executable private @apply_scale_zve64x {
 %cst = arith.constant dense<19689> : vector<2xi32>
 %cst_0 = arith.constant dense<15> : vector<2xi8>
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<2xi32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2xi32>
 %2 = vector.load %0[%c0] : memref<2xi32>, vector<2xi32>
 %3 = tosa.apply_scale %2, %cst, %cst_0 {double_round = false} : (vector<2xi32>, vector<2xi32>, vector<2xi8>) -> vector<2xi32>
 vector.store %3, %1[%c0] : memref<2xi32>, vector<2xi32>
@@ -167,8 +167,8 @@ hal.executable private @apply_scale_zve32x {
 %cst = arith.constant dense<19689> : vector<2xi32>
 %cst_0 = arith.constant dense<15> : vector<2xi8>
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<2xi32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2xi32>
 %2 = vector.load %0[%c0] : memref<2xi32>, vector<2xi32>
 %3 = tosa.apply_scale %2, %cst, %cst_0 {double_round = false} : (vector<2xi32>, vector<2xi32>, vector<2xi8>) -> vector<2xi32>
 vector.store %3, %1[%c0] : memref<2xi32>, vector<2xi32>
@@ -220,8 +220,8 @@ hal.executable private @apply_scale_zve32f {
 %cst = arith.constant dense<19689> : vector<2xi32>
 %cst_0 = arith.constant dense<15> : vector<2xi8>
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xi32>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<2xi32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2xi32>
 %2 = vector.load %0[%c0] : memref<2xi32>, vector<2xi32>
 %3 = tosa.apply_scale %2, %cst, %cst_0 {double_round = false} : (vector<2xi32>, vector<2xi32>, vector<2xi8>) -> vector<2xi32>
 vector.store %3, %1[%c0] : memref<2xi32>, vector<2xi32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
index 1d5686e0260d7..6268c4fb6494d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/convert_to_llvm.mlir
@@ -43,33 +43,37 @@ module {
 // -----
-module {
- func.func @interleave_and_bitcast_lowering() {
- %cst = arith.constant dense<4> : vector<4x2xi8>
- %cst_0 = arith.constant dense<0> : vector<4x4xi4>
- %c0 = arith.constant 0 : index
- %c1 = arith.constant 1 : index
- %c2 = arith.constant 2 : index
- %c3 = arith.constant 3 : index
- %c4096 = arith.constant 4096 : index
- %c8192 = arith.constant 8192 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c4096) flags(ReadOnly) : memref<128xi8, strided<[1], offset: 4096>>
- %out_buffer = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c8192) : memref<256x64xi4, strided<[64, 1], offset: 8192>>
- %2 = vector.load %0[%c0] : memref<128xi8, strided<[1], offset: 4096>>, vector<2xi8>
- %3 = vector.bitcast %2 : vector<2xi8> to vector<4xi4>
- %4 = vector.insert %3, %cst_0 [3] : vector<4xi4> into vector<4x4xi4>
- %5 = vector.bitcast %4 : vector<4x4xi4> to vector<4x2xi8>
- %6 = arith.shli %5, %cst : vector<4x2xi8>
- %7 = arith.shrsi %6, %cst : vector<4x2xi8>
- %8 = arith.shrsi %5, %cst : vector<4x2xi8>
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
+func.func @interleave_and_bitcast_lowering() {
+ %cst = arith.constant dense<4> : vector<4x2xi8>
+ %cst_0 = arith.constant dense<0> : vector<4x4xi4>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %c4096 = arith.constant 4096 : index
+ %c8192 = arith.constant 8192 : index
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c4096) flags(ReadOnly) : memref<128xi8, strided<[1], offset: 4096>>
+ %out_buffer = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c8192) : memref<256x64xi4, strided<[64, 1], offset: 8192>>
+ %2 = vector.load %0[%c0] : memref<128xi8, strided<[1], offset: 4096>>, vector<2xi8>
+ %3 = vector.bitcast %2 : vector<2xi8> to vector<4xi4>
+ %4 = vector.insert %3, %cst_0 [3] : vector<4xi4> into vector<4x4xi4>
+ %5 = vector.bitcast %4 : vector<4x4xi4> to vector<4x2xi8>
+ %6 = arith.shli %5, %cst : vector<4x2xi8>
+ %7 = arith.shrsi %6, %cst : vector<4x2xi8>
+ %8 = arith.shrsi %5, %cst : vector<4x2xi8>
- // Ops that should be lowered
- %9 = vector.interleave %7, %8 : vector<4x2xi8> -> vector<4x4xi8>
- %14 = vector.bitcast %9 : vector<4x4xi8> to vector<4x8xi4>
+ // Ops that should be lowered
+ %9 = vector.interleave %7, %8 : vector<4x2xi8> -> vector<4x4xi8>
+ %14 = vector.bitcast %9 : vector<4x4xi8> to vector<4x8xi4>
- vector.store %14, %out_buffer[%c0, %c0] : memref<256x64xi4, strided<[64, 1], offset: 8192>>, vector<4x8xi4>
- return
- }
+ vector.store %14, %out_buffer[%c0, %c0] : memref<256x64xi4, strided<[64, 1], offset: 8192>>, vector<4x8xi4>
+ return
 }
 // Make sure we can lower multi-dimensional `vector.interleave` and its
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir
index a48ac2a13ca0d..8675111e4cf6e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_bindings.mlir
@@ -1,5 +1,12 @@
 // RUN: iree-opt --iree-convert-to-llvm --split-input-file %s | FileCheck %s --dump-input=always
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
+
 // CHECK-LABEL: llvm.func @binding_ptrs(
 func.func @binding_ptrs() {
 // CHECK-DAG: %[[C2:.+]] = llvm.mlir.constant(2
@@ -12,7 +19,7 @@ func.func @binding_ptrs() {
 // CHECK: %[[BASE_PTR:.+]] = llvm.load %[[ARRAY_PTR]] : !llvm.ptr -> !llvm.ptr
 %c72 = arith.constant 72 : index
 %c128 = arith.constant 128 : index
- %memref = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c72) : memref>{%c128}
+ %memref = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) offset(%c72) : memref>{%c128}
 // CHECK: %[[OFFSET_PTR0:.+]] = llvm.getelementptr %[[BASE_PTR]][18]
 // CHECK: %[[OFFSET_D0:.+]] = llvm.mul %[[C5]], %[[C2]]
@@ -33,6 +40,13 @@ llvm.func @sink(%arg0: f32) {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
+
 // CHECK-LABEL: llvm.func @binding_ptrs_dynamic(
 func.func @binding_ptrs_dynamic() {
 // CHECK-DAG: %[[C1:.+]] = llvm.mlir.constant(1 :
@@ -46,7 +60,7 @@ func.func @binding_ptrs_dynamic() {
 // CHECK: %[[CONSTANT_BASEPTR:.+]] = llvm.extractvalue %[[STATE]][9]
 // CHECK: %[[OFFSET:.+]] = llvm.load %[[CONSTANT_BASEPTR]]
 // CHECK: %[[OFFSET_ZEXT:.+]] = llvm.zext %[[OFFSET]]
- %offset = hal.interface.constant.load[0] : index
+ %offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
 // CHECK: %[[STATE0:.+]] = llvm.load %arg1
 // CHECK: %[[CONSTANT_BASEPTR:.+]] = llvm.extractvalue %[[STATE0]][9]
@@ -58,15 +72,15 @@
 // CHECK: %[[DIM2_PTR:.+]] = llvm.getelementptr %[[CONSTANT_BASEPTR0]][3]
 // CHECK: %[[DIM2:.+]] = llvm.load %[[DIM2_PTR]]
 // CHECK: %[[DIM2_ZEXT:.+]] = llvm.zext %[[DIM2]]
- %dim0 = hal.interface.constant.load[1]: index
- %dim1 = hal.interface.constant.load[2] : index
- %dim2 = hal.interface.constant.load[3] : index
+ %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1): index
+ %dim1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+ %dim2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
 // CHECK: %[[STATE3:.+]] = llvm.load %arg1
 // CHECK: %[[BINDING_PTRS:.+]] = llvm.extractvalue %[[STATE3]][10]
 // CHECK: %[[ARRAY_PTR:.+]] = llvm.getelementptr %[[BINDING_PTRS]][1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
 // CHECK: %[[BASE_PTR:.+]] = llvm.load %[[ARRAY_PTR]] : !llvm.ptr -> !llvm.ptr
- %memref = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%offset) : memref>{%dim0, %dim1, %dim2}
+ %memref = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) offset(%offset) : memref>{%dim0, %dim1, %dim2}
 // CHECK: %[[BASE_BIT_OFFSET:.+]] = llvm.mul %[[OFFSET_ZEXT]], %[[C8]]
 // CHECK: %[[BASE_OFFSET:.+]] = llvm.udiv %[[BASE_BIT_OFFSET]], %[[C32]]
@@ -94,6 +108,13 @@ llvm.func @sink(%arg0: f32) {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
+
 // CHECK-LABEL: llvm.func @binding_ptrs_sub_byte_dynamic(
 func.func @binding_ptrs_sub_byte_dynamic() {
 // CHECK-DAG: %[[C8:.+]] = llvm.mlir.constant(8 :
@@ -103,14 +124,14 @@ func.func @binding_ptrs_sub_byte_dynamic() {
 // CHECK: %[[CONSTANT_BASEPTR:.+]] = llvm.extractvalue %[[STATE]][9]
 // CHECK: %[[OFFSET:.+]] = llvm.load %[[CONSTANT_BASEPTR]]
 // CHECK: %[[OFFSET_ZEXT:.+]] = llvm.zext %[[OFFSET]]
- %offset = hal.interface.constant.load[0] : index
- %dim0 = hal.interface.constant.load[1]: index
+ %offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+ %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1): index
 // CHECK: %[[STATE3:.+]] = llvm.load %arg1
 // CHECK: %[[BINDING_PTRS:.+]] = llvm.extractvalue %[[STATE3]][10]
 // CHECK: %[[ARRAY_PTR:.+]] = llvm.getelementptr %[[BINDING_PTRS]][1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr
 // CHECK: %[[BASE_PTR:.+]] = llvm.load %[[ARRAY_PTR]] : !llvm.ptr -> !llvm.ptr
- %memref = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%offset) : memref>{%dim0}
+ %memref = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) offset(%offset) : memref>{%dim0}
 // CHECK: %[[BASE_BIT_OFFSET:.+]] = llvm.mul %[[OFFSET_ZEXT]], %[[C8]]
 // CHECK: %[[BASE_OFFSET:.+]] = llvm.udiv %[[BASE_BIT_OFFSET]], %[[C4]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_constants.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_constants.mlir
index eb8618520fda9..cccef04c46d45 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_constants.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/hal_interface_constants.mlir
@@ -1,5 +1,11 @@
 // RUN: iree-opt --iree-convert-to-llvm --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout
+ ]>
+]>
+
 // CHECK-LABEL: llvm.func @constant_values
 func.func @constant_values() {
 // CHECK: %[[STATE:.+]] = llvm.load %arg1 : !llvm.ptr -> !llvm.struct<"iree_hal_executable_dispatch_state_v0_t"
@@ -7,7 +13,7 @@ func.func @constant_values() {
 // CHECK: %[[VPTR:.+]] = llvm.getelementptr %[[PTR_BASE]][1] : (!llvm.ptr) -> !llvm.ptr, i32
 // CHECK: %[[V32:.+]] = llvm.load %[[VPTR]] : !llvm.ptr -> i32
 // CHECK: %[[V64:.+]] = llvm.zext %[[V32]] : i32 to i64
- %v1 = hal.interface.constant.load[1] : index
+ %v1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
 // CHECK-NOT: unrealized_conversion_cast
 %v2 = arith.index_cast %v1 : index to i64
 // CHECK: llvm.call @sink
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
index d7cac72c039fb..7601ed03628c8 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/illegal_configuration.mlir
@@ -1,116 +1,155 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --verify-diagnostics --split-input-file %s
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
 #translation = #iree_codegen.translation_info
-module {
- func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{expected four tiling levels, got 0}}
- linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
- return
- }
+func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x16xf32>
+ // expected-error @+1 {{expected four tiling levels, got 0}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
 }
-
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
-module {
- func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{native_vector_size must be empty}}
- linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
- return
- }
+func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x16xf32>
+ // expected-error @+1 {{native_vector_size must be empty}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
 }
-
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
 module {
 func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x16xf32>
 // expected-error @+1 {{expected only parallel dims to be set in the second tiling level, got 2-th tile size set}}
 linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
 return
 }
 }
-
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
-module {
- func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{only reduction dims to be set in the third tiling level, got 1-th tile size set}}
- linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
- return
- }
+func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x16xf32>
+ // expected-error @+1 {{only reduction dims to be set in the third tiling level, got 1-th tile size set}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
 }
-
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
-module {
- func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
- // expected-error @+1 {{expected [0, 2) to be set exactly once in interchange #0}}
- linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
- return
- }
+func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x8xf32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8x16xf32>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x16xf32>
+ // expected-error @+1 {{expected [0, 2) to be set exactly once in interchange #0}}
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+ return
 }
-
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
-module {
- func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
- %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<36x9x9x512xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x3x512x512xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<36x7x7x512xf32>
- // expected-error @+1 {{can't decompose the conv op}}
- linalg.conv_2d_nhwc_hwcf {lowering_config = #config} ins(%0, %1 : memref<36x9x9x512xf32>, memref<3x3x512x512xf32>) outs(%2 : memref<36x7x7x512xf32>)
- return
- }
+func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
+ %c0 = arith.constant 0 : index
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<36x9x9x512xf32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<3x3x512x512xf32>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<36x7x7x512xf32>
+ // expected-error @+1 {{can't decompose the conv op}}
+ linalg.conv_2d_nhwc_hwcf {lowering_config = #config} ins(%0, %1 : memref<36x9x9x512xf32>, memref<3x3x512x512xf32>) outs(%2 : memref<36x7x7x512xf32>)
+ return
 }
-
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64">
 module {
 func.func @illegal() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #translation} {
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1x11x11x576xf32>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<5x5x576xf32>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1x7x7x576xf32>
 // expected-error @+1 {{can't decompose the conv op}}
 linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, lowering_config = #config, strides = dense<1> : tensor<2xi64>} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : memref<1x7x7x576xf32>)
 return
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/peel.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/peel.mlir
index 8c0719d9873ea..10aa93a1a2443 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/peel.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/peel.mlir
@@ -1,5 +1,12 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-peel))" -split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 func.func @peel_static_matmul() {
 %c16 = arith.constant 16 : index
 %c49 = arith.constant 49 : index
@@ -9,9 +16,9 @@ func.func @peel_static_matmul() {
 %c512 = arith.constant 512 : index
 %c128 = arith.constant 128 : index
 %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>
 %workgroup_id_x = hal.interface.workgroup.id[0] : index
 %workgroup_count_x = hal.interface.workgroup.count[0] : index
 %workgroup_id_y = hal.interface.workgroup.id[1] : index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_arm_sme_streaming_mode_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_arm_sme_streaming_mode_tests.mlir
index 3ff496a235f41..f9eae566f85b9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_arm_sme_streaming_mode_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_arm_sme_streaming_mode_tests.mlir
@@ -1,6 +1,13 @@
 // RUN: iree-opt --iree-codegen-linalg-to-llvm-pipeline=enable-arm-sme --split-input-file %s | FileCheck %s
 // RUN: iree-opt --iree-codegen-linalg-to-llvm-pipeline=enable-arm-sme --iree-llvmcpu-force-arm-streaming --split-input-file %s | FileCheck %s -check-prefixes=FORCE-ARM-STREAMING
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 module {
 module {
 func.func @fixed_size_dispatch() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>,
@@ -8,8 +15,8 @@ module {
 %c0 = arith.constant 0 : index
 %c1 = arith.constant 1 : index
 %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
 %2 = tensor.empty() : tensor<1xf32>
 %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
@@ -32,6 +39,13 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 module {
 module {
 func.func @scalable_dispatch() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>,
@@ -39,8 +53,8 @@ module {
 %c0 = arith.constant 0 : index
 %c1 = arith.constant 1 : index
 %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
 %2 = tensor.empty() : tensor<1xf32>
 %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%2 : tensor<1xf32>) -> tensor<1xf32>
@@ -64,6 +78,13 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 module {
 module {
 func.func @scalable_dispatch_using_za() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>,
@@ -71,8 +92,8 @@ module {
 %c0 = arith.constant 0 : index
 %c1 = arith.constant 1 : index
 %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
 %2 = tensor.empty() : tensor<100x100xf32>
 %3 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst : f32) outs(%2 : tensor<100x100xf32>) -> tensor<100x100xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir
index b4f45aabe48ce..e26a427d7bea1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pack_unpack_tests.mlir
@@ -1,5 +1,12 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
@@ -8,9 +15,9 @@ module {
 %c0 = arith.constant 0 : index
 %cst = arith.constant 3.40282347E+38 : f32
 %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
 %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor> -> tensor<512xf32>
 %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<384x512xf32>
 %5 = tensor.empty() : tensor<24x512x16x1xf32>
@@ -40,6 +47,13 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
@@ -48,9 +62,9 @@ module {
 %c0 = arith.constant 0 : index
 %cst = arith.constant 3.40282347E+38 : f32
 %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
 %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [24, 32, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<24x32x16x16xf32>
 %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor> -> tensor<512xf32>
 %5 = tensor.empty() : tensor<384x512xf32>
@@ -79,13 +93,19 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
 module {
 func.func @unaligned_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
 %c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
 %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [383, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<383x512xf32>
 %3 = tensor.empty() : tensor<24x512x16x1xf32>
 %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<383x512xf32> -> tensor<24x512x16x1xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir
index e425156bb35ec..b619559ffe3cf 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_conv_tests.mlir
@@ -1,43 +1,48 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}>
-module {
- func.func @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
- %cst = arith.constant 0.000000e+00 : f32
- %c1 = arith.constant 1 : index
- %c0 = arith.constant 0 : index
- %c5243520 = arith.constant 5243520 : index
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = hal.interface.constant.load[3] : i32
- %4 = hal.interface.constant.load[4] : i32
- %5 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [10486400 : index, 15729280 : index]} : i32 to index
- %6 = arith.index_castui %1 {stream.alignment = 256 : index, stream.values = [1273222400 : index, 1280618240 : index]} : i32 to index
- %7 = arith.index_castui %2 {stream.alignment = 256 : index, stream.values = [10507520 : index, 21488640 : index]} : i32 to index
- %8 = arith.index_castui %3 {stream.alignment = 256 : index, stream.values = [10508800 : index, 21489920 : index]} : i32 to index
- %9 = arith.index_castui %4 {stream.alignment = 128 : index, stream.values = [10486400 : index, 10487680 : index]} : i32 to index
- %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c5243520) flags(ReadOnly) : !flow.dispatch.tensor>
- %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor>
- %12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor>
- %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor>
- %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor>
- %15 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor>
- %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x320x64x64xf32>
- %17 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<320x320x3x3xf32>
- %18 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x320xf32>
- %19 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x320xf32>
- %20 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x320xf32>
- %21 = tensor.empty() : tensor<1x320x64x64xf32>
- %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
- %padded = tensor.pad %16 low[0, 0, 1, 1] high[0, 0, 1, 1] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst : f32
- } : tensor<1x320x64x64xf32> to tensor<1x320x66x66xf32>
- %23 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded, %17 : tensor<1x320x66x66xf32>, tensor<320x320x3x3xf32>) outs(%22 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
- flow.dispatch.tensor.store %23, %15, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<1x320x64x64xf32> -> !flow.dispatch.tensor>
- return
- }
+func.func @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %c1 = arith.constant 1 : index
+ %c0 = arith.constant 0 : index
+ %c5243520 = arith.constant 5243520 : index
+ %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+ %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+ %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+ %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+ %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+ %5 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [10486400 : index, 15729280 : index]} : i32 to index
+ %6 = arith.index_castui %1 {stream.alignment = 256 : index, stream.values = [1273222400 : index, 1280618240 : index]} : i32 to index
+ %7 = arith.index_castui %2 {stream.alignment = 256 : index, stream.values = [10507520 : index, 21488640 : index]} : i32 to index
+ %8 = arith.index_castui %3 {stream.alignment = 256 : index, stream.values = [10508800 : index, 21489920 : index]} : i32 to index
+ %9 = arith.index_castui %4 {stream.alignment = 128 : index, stream.values = [10486400 : index, 10487680 : index]} : i32 to index
+ %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c5243520) flags(ReadOnly) : !flow.dispatch.tensor>
+ %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor>
+ %12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor>
+ %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor>
+ %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor>
+ %15 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%9) : !flow.dispatch.tensor>
+ %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x320x64x64xf32>
+ %17 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<320x320x3x3xf32>
+ %18 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x320xf32>
+ %19 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x320xf32>
+ %20 = flow.dispatch.tensor.load %14, offsets = [0, 0], sizes = [1, 320], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x320xf32>
+ %21 = tensor.empty() : tensor<1x320x64x64xf32>
+ %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
+ %padded = tensor.pad %16 low[0, 0, 1, 1] high[0, 0, 1, 1] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst : f32
+ } : tensor<1x320x64x64xf32> to tensor<1x320x66x66xf32>
+ %23 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded, %17 : tensor<1x320x66x66xf32>, tensor<320x320x3x3xf32>) outs(%22 : tensor<1x320x64x64xf32>) -> tensor<1x320x64x64xf32>
+ flow.dispatch.tensor.store %23, %15, offsets = [0, 0, 0, 0], sizes = [1, 320, 64, 64], strides = [1, 1, 1, 1] : tensor<1x320x64x64xf32> -> !flow.dispatch.tensor>
+ return
 }
 // CHECK-LABEL: func.func @pad_conv_2d_nchw_fchw_1x320x64x64x320x3x3
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir
index 045193a29cea1..ab8b261fbe300 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_pad_tests.mlir
@@ -1,21 +1,25 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))" --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>
+ ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
- func.func @pad_only_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
- %c634816 = arith.constant 634816 : index
- %c3846080 = arith.constant 3846080 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c634816) flags(ReadOnly) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3846080) : !flow.dispatch.tensor>
- %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x112x112x64xf32>
- %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst : f32
- } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32>
- flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1] : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor>
- return
- }
+func.func @pad_only_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c634816 = arith.constant 634816 : index
+ %c3846080 = arith.constant 3846080 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c634816) flags(ReadOnly) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c3846080) : !flow.dispatch.tensor>
+ %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x112x112x64xf32>
+ %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst : f32
+ } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32>
+ flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1] : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor>
+ return
 }
 // CHECK-LABEL: func @pad_only_dispatch()
@@ -42,39 +46,45 @@ module {
 // CHECK: vector.store %[[RESULT_VEC]], %[[DROP_UNIT_OUTPUT_SLICE]]
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d3)>
-module {
- func.func @pad_with_producer_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
- %c802816 = arith.constant 802816 : index
- %c72545728 = arith.constant 72545728 : index
- %c72676800 = arith.constant 72676800 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 1.001000e-05 : f32
- %cst_0 = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c802816) flags(ReadOnly) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72545728) flags(ReadOnly) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c72676800) flags(ReadOnly) : !flow.dispatch.tensor>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
- %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 56, 56, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x56x56x256xf32>
- %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 256, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x256x128xf32>
- %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor> -> tensor<128xf32>
- %7 = tensor.empty() : tensor<1x28x28x128xf32>
- %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
- %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%4, %5 : tensor<1x56x56x256xf32>, tensor<1x1x256x128xf32>) outs(%8 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
- %10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<1x28x28x128xf32>, tensor<128xf32>) outs(%7 : tensor<1x28x28x128xf32>) {
- ^bb0(%in: f32, %in_1: f32, %out: f32):
- %11 = arith.addf %in, %in_1 : f32
- linalg.yield %11 : f32
- } -> tensor<1x28x28x128xf32>
- %padded = tensor.pad %10 low[0, 1, 1, 0] high[0, 1, 1, 0] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst_0 : f32
- } : tensor<1x28x28x128xf32> to tensor<1x30x30x128xf32>
- flow.dispatch.tensor.store %padded, %3, offsets = [0, 0, 0, 0], sizes = [1, 30, 30, 128], strides = [1, 1, 1, 1] : tensor<1x30x30x128xf32> -> !flow.dispatch.tensor>
- return
- }
+func.func @pad_with_producer_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c802816 = arith.constant 802816 : index
+ %c72545728 = arith.constant 72545728 : index
+ %c72676800 = arith.constant 72676800 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 1.001000e-05 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c802816) flags(ReadOnly) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c72545728) flags(ReadOnly) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c72676800) flags(ReadOnly) : !flow.dispatch.tensor>
+ %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 56, 56, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x56x56x256xf32>
+ %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 256, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x256x128xf32>
+ %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor> -> tensor<128xf32>
+ %7 = tensor.empty() : tensor<1x28x28x128xf32>
+ %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
+ %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%4, %5 : tensor<1x56x56x256xf32>, tensor<1x1x256x128xf32>) outs(%8 : tensor<1x28x28x128xf32>) -> tensor<1x28x28x128xf32>
+ %10 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<1x28x28x128xf32>, tensor<128xf32>) outs(%7 : tensor<1x28x28x128xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %11 = arith.addf %in, %in_1 : f32
+ linalg.yield %11 : f32
+ } -> tensor<1x28x28x128xf32>
+ %padded = tensor.pad %10 low[0, 1, 1, 0] high[0, 1, 1, 0] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst_0 : f32
+ } : tensor<1x28x28x128xf32> to tensor<1x30x30x128xf32>
+ flow.dispatch.tensor.store %padded, %3, offsets = [0, 0, 0, 0], sizes = [1, 30, 30, 128], strides = [1, 1, 1, 1] : tensor<1x30x30x128xf32> -> !flow.dispatch.tensor>
+ return
 }
 // CHECK-LABEL: func @pad_with_producer_dispatch()
@@ -117,25 +127,31 @@ module {
 // CHECK-SAME: outs(%[[INTERIOR_SLICE]] :
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
- func.func @pad_consumer_fusion_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x14x14x256xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 256, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x256x256xf32>
- %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x14x14x256xf32>
- %padded = tensor.pad %3 low[0, 1, 1, 0] high[0, 1, 1, 0] {
- ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
- tensor.yield %cst : f32
- } : tensor<1x14x14x256xf32> to tensor<1x16x16x256xf32>
- %6 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%padded, %4 : tensor<1x16x16x256xf32>, tensor<3x3x256x256xf32>) outs(%5 : tensor<1x14x14x256xf32>) -> tensor<1x14x14x256xf32>
- flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : tensor<1x14x14x256xf32> -> !flow.dispatch.tensor>
- return
- }
+func.func @pad_consumer_fusion_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x14x14x256xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 256, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x256x256xf32>
+ %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x14x14x256xf32>
+ %padded = tensor.pad %3 low[0, 1, 1, 0] high[0, 1, 1, 0] {
+ ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
+ tensor.yield %cst : f32
+ } : tensor<1x14x14x256xf32> to tensor<1x16x16x256xf32>
+ %6 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%padded, %4 : tensor<1x16x16x256xf32>, tensor<3x3x256x256xf32>) outs(%5 : tensor<1x14x14x256xf32>) -> tensor<1x14x14x256xf32>
+ flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 256], strides = [1, 1, 1, 1] : tensor<1x14x14x256xf32> -> !flow.dispatch.tensor>
+ return
 }
 // CHECK-LABEL: func @pad_consumer_fusion_dispatch()
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir
index a592284bf33fc..e8602dc6995aa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_peel_and_vectorize_tests.mlir
@@ -1,22 +1,27 @@
 // RUN: iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmcpu-lower-executable-target))' -split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {native_vector_size = 64}>
-module {
- func.func @no_peel_static_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x64xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x512xf32>
- %5 = tensor.empty() : tensor<128x512xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
- %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x64xf32>, tensor<64x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor>
- return
- }
+func.func @no_peel_static_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x64xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x512xf32>
+ %5 = tensor.empty() : tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x64xf32>, tensor<64x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor>
+ return
 }
 // CHECK-LABEL: func @no_peel_static_matmul()
@@ -28,23 +33,29 @@ module {
 // CHECK-NOT: scf.for
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {native_vector_size = 64}>
-module {
- func.func @peel_static_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>
- %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 49], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x49xf32>
- %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [49, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<49x512xf32>
- %5 = tensor.empty() : tensor<128x512xf32>
- %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
- %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x49xf32>, tensor<49x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
- flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor>
- return
- }
+func.func @peel_static_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 49], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x49xf32>
+ %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [49, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<49x512xf32>
+ %5 = tensor.empty() : tensor<128x512xf32>
+ %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x49xf32>, tensor<49x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32>
+ flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor>
+ return
 }
 // CHECK-LABEL: func @peel_static_matmul()
@@ -68,29 +79,35 @@ module {
 // CHECK-NOT: scf.for
 // -----
+
+#pipeline_layout = #hal.pipeline.layout,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {native_vector_size = 64}>
-module {
- func.func @peel_dynamic_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.interface.constant.load[0] : i32
- %1 = hal.interface.constant.load[1] : i32
- %2 = hal.interface.constant.load[2] : i32
- %3 = arith.index_cast %0 : i32 to index
- %4 = arith.index_cast %1 : i32 to index
- %5 = arith.index_cast %2 : i32 to index
- %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%4, %3}
- %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%3, %5}
- %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%4, %5}
- %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%4, %3], strides = [1, 1] : !flow.dispatch.tensor>{%4, %3} -> tensor
- %10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%3, %5], strides = [1, 1] : !flow.dispatch.tensor>{%3, %5} -> tensor
- %11 = tensor.empty(%4, %5) : tensor
- %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor) -> tensor
- %13 = linalg.matmul {lowering_config = #config} ins(%9, %10 : tensor, tensor) outs(%12 : tensor) -> tensor
- flow.dispatch.tensor.store %13, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%4, %5}
- return
- }
+func.func @peel_dynamic_matmul() attributes {hal.executable.target = #executable_target_system_elf_x86_64_, translation_info = #translation} {
+ %cst = arith.constant 0.000000e+00 : f32
+ %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+ %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+ %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+ %3 = arith.index_cast %0 : i32 to index
+ %4 = arith.index_cast %1 : i32 to index
+ %5 = arith.index_cast %2 : i32 to index
+ %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%4, %3}
+ %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%3, %5}
+ %8 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%4, %5} + %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%4, %3], strides = [1, 1] : !flow.dispatch.tensor>{%4, %3} -> tensor + %10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%3, %5], strides = [1, 1] : !flow.dispatch.tensor>{%3, %5} -> tensor + %11 = tensor.empty(%4, %5) : tensor + %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor) -> tensor + %13 = linalg.matmul {lowering_config = #config} ins(%9, %10 : tensor, tensor) outs(%12 : tensor) -> tensor + flow.dispatch.tensor.store %13, %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%4, %5} + return } // CHECK-LABEL: func @peel_dynamic_matmul() @@ -122,21 +139,29 @@ module { // CHECK-NOT: scf.for // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> module { func.func @peel_scalable_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_, translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_cast %0 : i32 to index %4 = arith.index_cast %1 : i32 to index %5 = arith.index_cast %2 : i32 to index - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%4, %3} - %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%3, %5} - %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%4, %5} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%4, %3} + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%3, %5} + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%4, %5} %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%4, %3], strides = [1, 1] : !flow.dispatch.tensor>{%4, %3} -> tensor %10 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [%3, %5], strides = [1, 1] : !flow.dispatch.tensor>{%3, %5} -> tensor %11 = tensor.empty(%4, %5) : tensor diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir index 63ab24fc17b49..3d4f9c71253fa 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_split_reduction_tests.mlir @@ -1,32 +1,36 @@ // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, 
func.func(iree-llvmcpu-lower-executable-target))' --iree-llvmcpu-reassociate-fp-reductions=false --split-input-file %s | FileCheck %s // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --iree-llvmcpu-reassociate-fp-reductions=true --split-input-file %s | FileCheck %s --check-prefix=REORDERCHECK +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0> : tensor<1024x512xi32> - %c1_i32 = arith.constant 1 : i32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xi32> - %3 = tensor.empty() : tensor<1024x512xi32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024x512xi32>) { - ^bb0(%in: i32, %out: i32): - %6 = arith.addi %in, %out : i32 - linalg.yield %6 : i32 - } -> tensor<1024x512xi32> - %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<1024x512xi32>) outs(%3 : tensor<1024x512xi32>) { - ^bb0(%in: i32, %out: i32): - %6 = arith.addi %in, %c1_i32 : i32 - linalg.yield %6 : i32 - } -> tensor<1024x512xi32> - flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor> - return - } +func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0> : tensor<1024x512xi32> + %c1_i32 = arith.constant 1 : i32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xi32> + %3 = tensor.empty() : tensor<1024x512xi32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024x512xi32>) { + ^bb0(%in: i32, %out: i32): + %6 = arith.addi %in, %out : i32 + linalg.yield %6 : i32 + } -> tensor<1024x512xi32> + %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : 
tensor<1024x512xi32>) outs(%3 : tensor<1024x512xi32>) { + ^bb0(%in: i32, %out: i32): + %6 = arith.addi %in, %c1_i32 : i32 + linalg.yield %6 : i32 + } -> tensor<1024x512xi32> + flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_supported() @@ -43,32 +47,37 @@ module { // CHECK: arith.addi %{{.+}}, %{{.+}} : vector<4xi32> // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0.000000e+00> : tensor<1024x512xf32> - %cst_0 = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xf32> - %3 = tensor.empty() : tensor<1024x512xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xf32>) outs(%cst : tensor<1024x512xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<1024x512xf32> - %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<1024x512xf32>) outs(%3 : tensor<1024x512xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %cst_0 : f32 - linalg.yield %6 : f32 - } -> tensor<1024x512xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.000000e+00> : tensor<1024x512xf32> + %cst_0 = arith.constant 1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xf32> + %3 = tensor.empty() : tensor<1024x512xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xf32>) outs(%cst : tensor<1024x512xf32>) { + ^bb0(%in: f32, %out: 
f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<1024x512xf32> + %5 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<1024x512xf32>) outs(%3 : tensor<1024x512xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %cst_0 : f32 + linalg.yield %6 : f32 + } -> tensor<1024x512xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @split_reduction_innermost_reduction_no_dynamic_perfect_tiling_float_supported_with_flag() @@ -88,28 +97,33 @@ module { // REORDERCHECK: arith.addf %{{.+}}, %{{.+}} : vector<4xf32> // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @split_reduction_innermost_reduction_next_dynamic_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = arith.index_castui %0 : i32 to index - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} - %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} - %4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [1024, %1, 256], strides = [1, 1, 1] : !flow.dispatch.tensor>{%1} -> tensor<1024x?x256xi32> - %5 = tensor.empty(%1) : tensor<1024x?xi32> - %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x?xi32>) -> tensor<1024x?xi32> - %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1024x?x256xi32>) outs(%6 : tensor<1024x?xi32>) { - ^bb0(%in: i32, %out: i32): - %8 = arith.addi %in, %out : i32 - linalg.yield %8 : i32 - } -> tensor<1024x?xi32> - flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1024, %1], strides = [1, 1] : tensor<1024x?xi32> -> !flow.dispatch.tensor>{%1} - return - } +func.func @split_reduction_innermost_reduction_next_dynamic_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} + %4 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [1024, %1, 256], strides = [1, 1, 1] : !flow.dispatch.tensor>{%1} -> tensor<1024x?x256xi32> + %5 = tensor.empty(%1) : tensor<1024x?xi32> + %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x?xi32>) -> tensor<1024x?xi32> + %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : 
tensor<1024x?x256xi32>) outs(%6 : tensor<1024x?xi32>) { + ^bb0(%in: i32, %out: i32): + %8 = arith.addi %in, %out : i32 + linalg.yield %8 : i32 + } -> tensor<1024x?xi32> + flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1024, %1], strides = [1, 1] : tensor<1024x?xi32> -> !flow.dispatch.tensor>{%1} + return } // CHECK-LABEL: func.func @split_reduction_innermost_reduction_next_dynamic_supported() @@ -125,24 +139,29 @@ module { // CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32 // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0> : tensor<1024x513xi32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 513, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x513x256xi32> - %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x513x256xi32>) outs(%cst : tensor<1024x513xi32>) { - ^bb0(%in: i32, %out: i32): - %4 = arith.addi %in, %out : i32 - linalg.yield %4 : i32 - } -> tensor<1024x513xi32> - flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 513], strides = [1, 1] : tensor<1024x513xi32> -> !flow.dispatch.tensor> - return - } +func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0> : tensor<1024x513xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 513, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x513x256xi32> + %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x513x256xi32>) outs(%cst : tensor<1024x513xi32>) { + ^bb0(%in: i32, %out: i32): + %4 = arith.addi %in, %out : i32 + linalg.yield %4 : i32 + } -> tensor<1024x513xi32> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 513], strides = [1, 1] : tensor<1024x513xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @split_reduction_innermost_reduction_next_imperfect_tiling_supported() @@ -158,74 +177,89 @@ module { // CHECK: vector.reduction <add>, %{{.+}} %{{.+}} : vector<4xi32> into i32 // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @split_reduction_innermost_dynamic_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant dense<0> : tensor<1024x512xi32> - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = arith.index_castui %0 : i32 to index - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} - %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [1024, 512, %1], strides = [1, 1, 1] : !flow.dispatch.tensor>{%1} -> tensor<1024x512x?xi32> - %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1024x512x?xi32>) outs(%cst : tensor<1024x512xi32>) { - ^bb0(%in: i32, %out: i32): - %6 = arith.addi %in, %out : i32 - linalg.yield %6 : i32 - } -> tensor<1024x512xi32> - flow.dispatch.tensor.store %5, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor> - return - } +func.func @split_reduction_innermost_dynamic_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant dense<0> : tensor<1024x512xi32> + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = arith.index_castui %0 : i32 to index + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%1} + %4 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [1024, 512, %1], strides = [1, 1, 1] : !flow.dispatch.tensor>{%1} -> tensor<1024x512x?xi32> + %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4 : tensor<1024x512x?xi32>) outs(%cst : tensor<1024x512xi32>) { + ^bb0(%in: i32, %out: i32): + %6 = arith.addi %in, %out : i32 + linalg.yield %6 : i32 + } -> tensor<1024x512xi32> + flow.dispatch.tensor.store %5, %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @split_reduction_innermost_dynamic_reduction_unsupported() // CHECK-COUNT-4: vector.mask %{{.*}} { vector.reduction // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @split_reduction_innermost_imperfect_reduction_unsupported() attributes {hal.executable.target = 
#executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0> : tensor<1024x512xi32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 257], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x257xi32> - %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x257xi32>) outs(%cst : tensor<1024x512xi32>) { - ^bb0(%in: i32, %out: i32): - %4 = arith.addi %in, %out : i32 - linalg.yield %4 : i32 - } -> tensor<1024x512xi32> - flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor> - return - } +func.func @split_reduction_innermost_imperfect_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0> : tensor<1024x512xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 257], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x257xi32> + %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x257xi32>) outs(%cst : tensor<1024x512xi32>) { + ^bb0(%in: i32, %out: i32): + %4 = arith.addi %in, %out : i32 + linalg.yield %4 : i32 + } -> tensor<1024x512xi32> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @split_reduction_innermost_imperfect_reduction_unsupported() // CHECK-COUNT-4: vector.mask %{{.*}} { vector.reduction // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @split_reduction_not_innermost_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0> : tensor<1024x256xi32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xi32> - %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : 
tensor<1024x256xi32>) { - ^bb0(%in: i32, %out: i32): - %4 = arith.addi %in, %out : i32 - linalg.yield %4 : i32 - } -> tensor<1024x256xi32> - flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xi32> -> !flow.dispatch.tensor> - return - } +func.func @split_reduction_not_innermost_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0> : tensor<1024x256xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xi32> + %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024x256xi32>) { + ^bb0(%in: i32, %out: i32): + %4 = arith.addi %in, %out : i32 + linalg.yield %4 : i32 + } -> tensor<1024x256xi32> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @split_reduction_not_innermost_reduction_unsupported() @@ -233,24 +267,29 @@ module { // CHECK-NOT: vector.reduction // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0)> -module { - func.func @split_reduction_double_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant dense<0> : tensor<1024xi32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xi32> - %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024xi32>) { - ^bb0(%in: i32, %out: i32): - %4 = arith.addi %in, %out : i32 - linalg.yield %4 : i32 - } -> tensor<1024xi32> - flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xi32> -> !flow.dispatch.tensor> - return - } +func.func @split_reduction_double_reduction_unsupported() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0> : tensor<1024xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1024x512x256xi32> + %3 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction", "reduction"]} ins(%2 : tensor<1024x512x256xi32>) outs(%cst : tensor<1024xi32>) { + ^bb0(%in: i32, %out: i32): + %4 = arith.addi %in, %out : i32 + linalg.yield %4 : i32 + } -> tensor<1024xi32> + flow.dispatch.tensor.store %3, %1, offsets = [0], sizes = [1024], strides = [1] : tensor<1024xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @split_reduction_double_reduction_unsupported() diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir index 4d6027ff5077f..f646cee2d08b8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir @@ -6,36 +6,40 @@ // and the conversion to destination passing style. Running CSE // before hoists the fill and the empty out of the loop causing // issues with the conversion. +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> #map2 = affine_map<(d0) -> (d0)> -module { - func.func @check_no_cse() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 3.840000e+02 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 10752 : index]} : i32 to index - %3 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [10752 : index, 21504 : index]} : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) : !flow.dispatch.tensor> - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor> - %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [7, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<7x384xf32> - %7 = tensor.empty() : tensor<7xf32> - %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<7xf32>) -> tensor<7xf32> - %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor<7x384xf32>) outs(%8 : tensor<7xf32>) { - ^bb0(%in: f32, %out: f32): - %11 = arith.addf %out, %in : f32 - linalg.yield %11 : f32 - } -> tensor<7xf32> - %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%9 : tensor<7xf32>) outs(%7 : tensor<7xf32>) { - ^bb0(%in: f32, %out: f32): - %11 = arith.divf %in, %cst : f32 - linalg.yield %11 : f32 - } -> tensor<7xf32> - flow.dispatch.tensor.store %10, %5, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor> - return - } +func.func @check_no_cse() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 3.840000e+02 : f32 + %cst_0 = arith.constant 
0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 10752 : index]} : i32 to index + %3 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [10752 : index, 21504 : index]} : i32 to index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%2) : !flow.dispatch.tensor> + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%3) : !flow.dispatch.tensor> + %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [7, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<7x384xf32> + %7 = tensor.empty() : tensor<7xf32> + %8 = linalg.fill ins(%cst_0 : f32) outs(%7 : tensor<7xf32>) -> tensor<7xf32> + %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor<7x384xf32>) outs(%8 : tensor<7xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = arith.addf %out, %in : f32 + linalg.yield %11 : f32 + } -> tensor<7xf32> + %10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%9 : tensor<7xf32>) outs(%7 : tensor<7xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = arith.divf %in, %cst : f32 + linalg.yield %11 : f32 + } -> tensor<7xf32> + flow.dispatch.tensor.store %10, %5, offsets = [0], sizes = [7], strides = [1] : tensor<7xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @check_no_cse() // CHECK-NOT: memref.alloc @@ -46,38 +50,44 @@ module { // CHECK: memref.store // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @peel_partially_unaligned_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [0 : index, 131712 : index]} : i32 to index - %5 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [576704 : index, 1763072 : index]} : i32 to index - %6 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [908480 : index, 2094848 : index]} : i32 to index - %7 = arith.index_castui %3 {stream.alignment = 128 : index, stream.values = [2304 : index, 134016 : index]} : i32 to index - %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> - %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> - %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor> - %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) 
offset(%7) : !flow.dispatch.tensor> - %12 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x576xf32> - %13 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [576, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x144xf32> - %14 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x144xf32> - %15 = tensor.empty() : tensor<1x144xf32> - %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1x144xf32>) -> tensor<1x144xf32> - %17 = linalg.matmul ins(%12, %13 : tensor<1x576xf32>, tensor<576x144xf32>) outs(%16 : tensor<1x144xf32>) -> tensor<1x144xf32> - %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<1x144xf32>, tensor<1x144xf32>) outs(%15 : tensor<1x144xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %19 = arith.addf %in, %in_0 : f32 - %20 = arith.maximumf %19, %cst : f32 - linalg.yield %20 : f32 - } -> tensor<1x144xf32> - flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : tensor<1x144xf32> -> !flow.dispatch.tensor> - return - } +func.func @peel_partially_unaligned_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [0 : index, 131712 : index]} : i32 to index + %5 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [576704 : index, 1763072 : index]} : i32 to index + %6 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [908480 : index, 2094848 : index]} : i32 to index + %7 = arith.index_castui %3 {stream.alignment = 128 : index, stream.values = [2304 : index, 134016 : index]} : i32 to index + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%7) : !flow.dispatch.tensor> + %12 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x576xf32> + %13 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [576, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x144xf32> + %14 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x144xf32> + %15 = tensor.empty() : tensor<1x144xf32> + %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1x144xf32>) -> tensor<1x144xf32> + %17 = linalg.matmul ins(%12, %13 : tensor<1x576xf32>, tensor<576x144xf32>) outs(%16 : tensor<1x144xf32>) -> tensor<1x144xf32> + %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : 
tensor<1x144xf32>, tensor<1x144xf32>) outs(%15 : tensor<1x144xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %19 = arith.addf %in, %in_0 : f32 + %20 = arith.maximumf %19, %cst : f32 + linalg.yield %20 : f32 + } -> tensor<1x144xf32> + flow.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : tensor<1x144xf32> -> !flow.dispatch.tensor> + return } // Checks that the bounded stack allocations are created. // CHECK-LABEL: func.func @peel_partially_unaligned_matmul @@ -92,56 +102,67 @@ module { // CHECK: arith.maximumf {{.*}} : vector< // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @batch_matmul_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 - %5 = hal.interface.constant.load[5] : i32 - %6 = arith.index_cast %0 : i32 to index - %7 = arith.index_cast %1 : i32 to index - %8 = arith.index_cast %2 : i32 to index - %9 = arith.index_cast %3 : i32 to index - %10 = arith.index_cast %4 : i32 to index - %11 = arith.index_cast %5 : i32 to index - %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7, %9} - %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%10, %11, %8} - %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7, %8} - %15 = flow.dispatch.tensor.load %12, offsets = [0, 0, 0], sizes = [%6, %7, %9], strides = [1, 1, 1] : !flow.dispatch.tensor>{%6, %7, %9} -> tensor - %16 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [%10, %11, %8], strides = [1, 1, 1] : !flow.dispatch.tensor>{%10, %11, %8} -> tensor - %17 = tensor.empty(%6, %7, %8) : tensor - %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor) -> tensor - %19 = linalg.batch_matmul ins(%15, %16 : tensor, tensor) outs(%18 : tensor) -> tensor - flow.dispatch.tensor.store %19, %14, offsets = [0, 0, 0], sizes = [%6, %7, %8], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%6, %7, %8} - return - } +func.func @batch_matmul_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 + %6 = arith.index_cast %0 : i32 to index + %7 = arith.index_cast %1 : i32 to index 
+ %8 = arith.index_cast %2 : i32 to index + %9 = arith.index_cast %3 : i32 to index + %10 = arith.index_cast %4 : i32 to index + %11 = arith.index_cast %5 : i32 to index + %12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7, %9} + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%10, %11, %8} + %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7, %8} + %15 = flow.dispatch.tensor.load %12, offsets = [0, 0, 0], sizes = [%6, %7, %9], strides = [1, 1, 1] : !flow.dispatch.tensor>{%6, %7, %9} -> tensor + %16 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [%10, %11, %8], strides = [1, 1, 1] : !flow.dispatch.tensor>{%10, %11, %8} -> tensor + %17 = tensor.empty(%6, %7, %8) : tensor + %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor) -> tensor + %19 = linalg.batch_matmul ins(%15, %16 : tensor, tensor) outs(%18 : tensor) -> tensor + flow.dispatch.tensor.store %19, %14, offsets = [0, 0, 0], sizes = [%6, %7, %8], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%6, %7, %8} + return } // CHECK-LABEL: func.func @batch_matmul_dynamic // CHECK: vector.fma // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1) -> (d0 * 1536 + d1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @check_buffer_ops_vectorization() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1024xi32> - memref.assume_alignment %0, 64 : memref<128x1024xi32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1536xi32> - memref.assume_alignment %1, 64 : memref<128x1536xi32> - %subview = memref.subview %1[0, 0] [128, 1024] [1, 1] : memref<128x1536xi32> to memref<128x1024xi32, #map> - linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : memref<128x1024xi32>) outs(%subview : memref<128x1024xi32, #map>) { - ^bb0(%in: i32, %out: i32): - linalg.yield %in : i32 - } - return +func.func @check_buffer_ops_vectorization() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<128x1024xi32> + memref.assume_alignment %0, 64 : memref<128x1024xi32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128x1536xi32> + memref.assume_alignment %1, 64 : memref<128x1536xi32> + %subview = memref.subview %1[0, 0] [128, 1024] [1, 1] : memref<128x1536xi32> to memref<128x1024xi32, #map> + linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : memref<128x1024xi32>) outs(%subview : memref<128x1024xi32, #map>) { + ^bb0(%in: i32, %out: i32): + linalg.yield %in : i32 } + return 
} // CHECK-LABEL: #{{.+}} = #iree_codegen.translation_info, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2, d3) -> (d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module { - func.func @vectorize_fill_conv2d_generic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %cst_0 = arith.constant 3.000000e+00 : f32 - %cst_1 = arith.constant 6.000000e+00 : f32 - %cst_2 = arith.constant 0.166666672 : f32 - %cst_3 = arith.constant dense<0.000000e+00> : tensor<16xf32> - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x225x225x3xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x3x16xf32> - %5 = tensor.empty() : tensor<1x112x112x16xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32> - %8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_3, %7 : tensor<16xf32>, tensor<1x112x112x16xf32>) outs(%5 : tensor<1x112x112x16xf32>) { - ^bb0(%in: f32, %in_4: f32, %out: f32): - %9 = arith.addf %in, %in_4 : f32 - %10 = arith.addf %9, %cst_0 : f32 - %11 = arith.cmpf olt, %10, %cst : f32 - %12 = arith.select %11, %cst, %10 : f32 - %13 = arith.cmpf olt, %cst_1, %10 : f32 - %14 = arith.select %13, %cst_1, %12 : f32 - %15 = arith.mulf %9, %14 : f32 - %16 = arith.mulf %15, %cst_2 : f32 - linalg.yield %16 : f32 - } -> tensor<1x112x112x16xf32> - flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor> - return - } +func.func @vectorize_fill_conv2d_generic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 3.000000e+00 : f32 + %cst_1 = arith.constant 6.000000e+00 : f32 + %cst_2 = arith.constant 0.166666672 : f32 + %cst_3 = arith.constant dense<0.000000e+00> : tensor<16xf32> + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x225x225x3xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x3x16xf32> + %5 = tensor.empty() : tensor<1x112x112x16xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32> + %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32> + %8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_3, %7 : tensor<16xf32>, tensor<1x112x112x16xf32>) outs(%5 : tensor<1x112x112x16xf32>) { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %9 = arith.addf %in, %in_4 : f32 + %10 = arith.addf %9, %cst_0 : f32 + %11 = arith.cmpf olt, %10, %cst : f32 + %12 = arith.select %11, %cst, %10 : f32 + %13 = arith.cmpf olt, %cst_1, %10 : f32 + %14 = arith.select %13, %cst_1, %12 : f32 + %15 = arith.mulf %9, %14 : f32 + %16 = arith.mulf %15, %cst_2 : f32 + linalg.yield %16 : f32 + } -> tensor<1x112x112x16xf32> + flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @vectorize_fill_conv2d_generic @@ -193,35 +220,44 @@ module { // CHECK: arith.cmpf olt, %{{.+}}, %{{.+}} : vector<4x4xf32> // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer>, + #hal.descriptor_set.binding<5, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> -module { - func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %cst_0 = arith.constant 1.000000e-03 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %6 = flow.dispatch.tensor.load %0, offsets = [0, 
0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x128xf32> - %7 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> - %8 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor> -> tensor<256xf32> - %9 = tensor.empty() : tensor<64x256xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> - %11 = linalg.matmul ins(%6, %7 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%10 : tensor<64x256xf32>) -> tensor<64x256xf32> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%11, %8 : tensor<64x256xf32>, tensor<256xf32>) outs(%9 : tensor<64x256xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %13 = arith.addf %in, %in_1 : f32 - linalg.yield %13 : f32 - } -> tensor<64x256xf32> - flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor> - flow.dispatch.tensor.store %12, %5, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor> - return - } +func.func @multi_result() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 1.000000e-03 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(5) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %6 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x128xf32> + %7 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> + %8 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor> -> tensor<256xf32> + %9 = tensor.empty() : tensor<64x256xf32> + %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<64x256xf32>) -> tensor<64x256xf32> + %11 = linalg.matmul ins(%6, %7 : tensor<64x128xf32>, tensor<128x256xf32>) outs(%10 : tensor<64x256xf32>) -> tensor<64x256xf32> + %12 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%11, %8 : tensor<64x256xf32>, tensor<256xf32>) outs(%9 : tensor<64x256xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %13 = arith.addf %in, %in_1 : f32 + linalg.yield %13 : f32 + } -> tensor<64x256xf32> + flow.dispatch.tensor.store %11, %4, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor> + flow.dispatch.tensor.store %12, %5, offsets = [0, 0], sizes = [64, 256], strides = [1, 1] : tensor<64x256xf32> -> !flow.dispatch.tensor> + 
return } // CHECK-LABEL: func @multi_result // CHECK: scf.for @@ -232,20 +268,25 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf", ukernels = "mmt4d"}> -module { - func.func @ukernel_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8x32xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [16, 4, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x4x16x32xf32> - %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x16x8x16xf32> - %6 = linalg.mmt4d ins(%3, %4 : tensor<2x4x8x32xf32>, tensor<16x4x16x32xf32>) outs(%5 : tensor<2x16x8x16xf32>) -> tensor<2x16x8x16xf32> - flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : tensor<2x16x8x16xf32> -> !flow.dispatch.tensor> - return - } +func.func @ukernel_dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [16, 4, 16, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x4x16x32xf32> + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x16x8x16xf32> + %6 = linalg.mmt4d ins(%3, %4 : tensor<2x4x8x32xf32>, tensor<16x4x16x32xf32>) outs(%5 : tensor<2x16x8x16xf32>) -> tensor<2x16x8x16xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 16], strides = [1, 1, 1, 1] : tensor<2x16x8x16xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @ukernel_dispatch() // Checks scf.for for distribution loops. 
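// With ukernels = "mmt4d" on the target, the linalg.mmt4d above is expected to
// lower to a ukernel call rather than inline vector code; the checks that follow
// match that call. A rough sketch of the lowered form (tile shapes and trailing
// operands are illustrative, not verbatim pass output):
//   %r = iree_codegen.ukernel.generic "iree_uk_mmt4d"
//          ins(%lhs_tile, %rhs_tile : tensor<1x4x8x32xf32>, tensor<1x4x16x32xf32>)
//          outs(%acc_tile : tensor<1x1x8x16xf32>) (...) -> tensor<1x1x8x16xf32>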
@@ -259,37 +300,43 @@ module { // CHECK: iree_codegen.ukernel.generic "iree_uk_mmt4d" // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf", ukernels = "all"}> #map = affine_map<()[s0, s1, s2] -> (s0 - s1 * (s0 ceildiv s2), s0 ceildiv s2)> #map1 = affine_map<()[s0, s1, s2] -> (s0 * (s1 ceildiv s2))> -module { - func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.index_castui %0 : i32 to index - %3 = arith.index_castui %1 : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%2} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%2} - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %7 = affine.min #map()[%2, %workgroup_id_x, %workgroup_count_x] - %8 = affine.apply #map1()[%workgroup_id_x, %2, %workgroup_count_x] - %9 = flow.dispatch.tensor.load %4, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor - %10 = flow.dispatch.tensor.load %5, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor>{%3} -> tensor - %11 = tensor.empty(%7) : tensor - %12 = iree_codegen.ukernel.generic "simple_mul_workgroup" ins(%9, %10 : tensor, tensor) outs(%11 : tensor) (%7 : index) -> tensor - flow.dispatch.tensor.store %12, %6, offsets = [%8], sizes = [%7], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} - return - } +func.func @dispatch() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.index_castui %0 : i32 to index + %3 = arith.index_castui %1 : i32 to index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%2} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%3} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%2} + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %7 = affine.min #map()[%2, %workgroup_id_x, %workgroup_count_x] + %8 = affine.apply #map1()[%workgroup_id_x, %2, %workgroup_count_x] + %9 = flow.dispatch.tensor.load %4, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor>{%2} -> tensor + %10 = flow.dispatch.tensor.load %5, offsets = [%8], sizes = [%7], strides = [1] : !flow.dispatch.tensor>{%3} -> 
tensor + %11 = tensor.empty(%7) : tensor + %12 = iree_codegen.ukernel.generic "simple_mul_workgroup" ins(%9, %10 : tensor, tensor) outs(%11 : tensor) (%7 : index) -> tensor + flow.dispatch.tensor.store %12, %6, offsets = [%8], sizes = [%7], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} + return } // CHECK: func @dispatch -// CHECK: %[[INPUT0:.+]] = hal.interface.binding.subspan set(0) binding(0) +// CHECK: %[[INPUT0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) // CHECK-SAME: memref> -// CHECK: %[[INPUT1:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[INPUT1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-SAME: memref> -// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-SAME: memref> // CHECK-DAG: %[[OFFSET:.+]] = affine.apply // CHECK-DAG: %[[SIZE:.+]] = affine.min @@ -301,26 +348,32 @@ module { // CHECK-SAME: outs(%[[SUBVIEW_OUTPUT]] // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #config1 = #iree_codegen.lowering_config #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+fma,+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> -module { - func.func @unsupported_ukernel_fallback_to_vectorization() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #iree_codegen.translation_info} { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c132096 = arith.constant 132096 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1024) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c132096) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 256, 1, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x256x1x1xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [4, 256, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x256x128x1xi8> - %5 = tensor.empty() : tensor<1x4x1x128xf32> - %6 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%5 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32> - %7 = linalg.mmt4d {lowering_config = #config1} ins(%3, %4 : tensor<1x256x1x1xf32>, tensor<4x256x128x1xi8>) outs(%6 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 1, 128], strides = [1, 1, 1, 1] : tensor<1x4x1x128xf32> -> !flow.dispatch.tensor> - return - } +func.func @unsupported_ukernel_fallback_to_vectorization() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_, translation_info = #iree_codegen.translation_info} { + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %c132096 = arith.constant 132096 : 
index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c1024) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c132096) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 256, 1, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x256x1x1xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [4, 256, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x256x128x1xi8> + %5 = tensor.empty() : tensor<1x4x1x128xf32> + %6 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%5 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32> + %7 = linalg.mmt4d {lowering_config = #config1} ins(%3, %4 : tensor<1x256x1x1xf32>, tensor<4x256x128x1xi8>) outs(%6 : tensor<1x4x1x128xf32>) -> tensor<1x4x1x128xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 1, 128], strides = [1, 1, 1, 1] : tensor<1x4x1x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @unsupported_ukernel_fallback_to_vectorization // CHECK: vector.fma diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir index f9692d1794251..97ef7fc30bd02 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_transpose_avx2_tests.mlir @@ -1,23 +1,27 @@ // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1) -> (d1, d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @transpose_10_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<1024x512xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : 
tensor<1024x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @transpose_10_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1024x512xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @transpose_10_8x8_pattern @@ -32,24 +36,29 @@ module { // CHECK-COUNT-8: vector.store // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d2, d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @transpose_021_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x128x96xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<64x128x96xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<64x128x96xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : tensor<64x128x96xf32> -> !flow.dispatch.tensor> - return - } +func.func @transpose_021_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : 
!flow.dispatch.tensor> -> tensor<64x96x128xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x128x96xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<64x128x96xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<64x128x96xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [64, 128, 96], strides = [1, 1, 1] : tensor<64x128x96xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @transpose_021_8x8_pattern @@ -64,24 +73,29 @@ module { // CHECK-COUNT-8: vector.store // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d1, d2, d0)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @transpose_201_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x64x96xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<128x64x96xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<128x64x96xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : tensor<128x64x96xf32> -> !flow.dispatch.tensor> - return - } +func.func @transpose_201_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x64x96xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<128x64x96xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<128x64x96xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [128, 64, 96], strides = [1, 1, 1] : tensor<128x64x96xf32> -> 
!flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @transpose_201_8x8_pattern @@ -96,24 +110,29 @@ module { // CHECK-COUNT-8: vector.store // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d2, d1, d0)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @transpose_210_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x96x64xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<128x96x64xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<128x96x64xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : tensor<128x96x64xf32> -> !flow.dispatch.tensor> - return - } +func.func @transpose_210_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x96x64xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<128x96x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<128x96x64xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [128, 96, 64], strides = [1, 1, 1] : tensor<128x96x64xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @transpose_210_8x8_pattern @@ -128,24 +147,29 @@ module { // CHECK-COUNT-8: vector.store // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d2, d0, 
d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @transpose_120_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<96x128x64xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<96x128x64xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<96x128x64xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : tensor<96x128x64xf32> -> !flow.dispatch.tensor> - return - } +func.func @transpose_120_8x8_pattern() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<96x128x64xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<96x128x64xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<96x128x64xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [96, 128, 64], strides = [1, 1, 1] : tensor<96x128x64xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @transpose_120_8x8_pattern @@ -160,48 +184,59 @@ module { // CHECK-COUNT-8: vector.store // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d1, d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @transpose_102() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], 
strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<96x64x128xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<96x64x128xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<96x64x128xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : tensor<96x64x128xf32> -> !flow.dispatch.tensor> - return - } +func.func @transpose_102() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 96, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x96x128xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<96x64x128xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<64x96x128xf32>) outs(%3 : tensor<96x64x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<96x64x128xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [96, 64, 128], strides = [1, 1, 1] : tensor<96x64x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @transpose_102 // CHECK-NOT: vector.shuffle %{{.*}}, %{{.*}} [0, 8, 1, 9, 4, 12, 5, 13] : vector<8xf32>, vector<8xf32> // CHECK-NOT: llvm.inline_asm asm_dialect = intel "vblendps $0, $1, $2, 0xcc", "=x,x,x" %{{.*}}, %{{.*}} : (vector<8xf32>, vector<8xf32>) -> vector<8xf32> + // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1) -> (d1, d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @test_no_avx2_feature() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<1024x512xf32> - 
flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @test_no_avx2_feature() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1024x512xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @test_no_avx2_feature diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir index 9927fa72cc17f..c93ab703ac8af 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vector_masking_tests.mlir @@ -1,29 +1,34 @@ // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' -split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-linux-gnu"}> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.index_cast %0 : i32 to index - %3 = arith.index_cast %1 : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %9 = tensor.empty(%2, %3) : tensor - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor - %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) 
outs(%10 : tensor) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %12 = arith.addf %in, %in_0 : f32 - linalg.yield %12 : f32 - } -> tensor - flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} - return - } +func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.index_cast %0 : i32 to index + %3 = arith.index_cast %1 : i32 to index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%2, %3} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%2, %3} + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %9 = tensor.empty(%2, %3) : tensor + %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor + %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %12 = arith.addf %in, %in_0 : f32 + linalg.yield %12 : f32 + } -> tensor + flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} + return } // Masking is applied to the main vector loop when peeling is not used. 
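// As a point of reference, the masked main loop matched below has roughly this
// shape (mask and vector sizes here are hypothetical, chosen only to illustrate
// the vector.mask idiom, not verbatim pass output):
//   %mask = vector.create_mask %rem0, %rem1 : vector<8x32xi1>
//   %v = vector.mask %mask {
//          vector.transfer_read %in[%i, %j], %pad : tensor<?x?xf32>, vector<8x32xf32>
//        } : vector<8x32xi1> -> vector<8x32xf32>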
@@ -37,29 +42,34 @@ module { // CHECK-NOT: scf.for // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-linux-gnu"}> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @mask_dynamic_reduction() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.index_cast %0 : i32 to index - %3 = arith.index_cast %1 : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2} - %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %7 = tensor.empty(%2) : tensor - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor) -> tensor - %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor) outs(%8 : tensor) { - ^bb0(%in: f32, %out: f32): - %10 = arith.addf %out, %in : f32 - linalg.yield %10 : f32 - } -> tensor - flow.dispatch.tensor.store %9, %5, offsets = [0], sizes = [%2], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} - return - } +func.func @mask_dynamic_reduction() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.index_cast %0 : i32 to index + %3 = arith.index_cast %1 : i32 to index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%2, %3} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2} + %6 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %7 = tensor.empty(%2) : tensor + %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor) -> tensor + %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%6 : tensor) outs(%8 : tensor) { + ^bb0(%in: f32, %out: f32): + %10 = arith.addf %out, %in : f32 + linalg.yield %10 : f32 + } -> tensor + flow.dispatch.tensor.store %9, %5, offsets = [0], sizes = [%2], strides = [1] : tensor -> !flow.dispatch.tensor>{%2} + return } // CHECK-LABEL: func.func @mask_dynamic_reduction @@ -67,30 +77,36 @@ module { // CHECK: vector.mask %{{.*}} { vector.reduction // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_riscv_32_ = #hal.executable.target<"llvm-cpu", "embedded-elf-riscv_32", {data_layout = "e-m:e-p:32:32-i64:64-n32-S128", native_vector_size = 32 : index, target_triple = "riscv32-none-elf"}> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} { - 
%cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.index_cast %0 : i32 to index - %3 = arith.index_cast %1 : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %9 = tensor.empty(%2, %3) : tensor - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor - %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %12 = arith.addf %in, %in_0 : f32 - linalg.yield %12 : f32 - } -> tensor - flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} - return - } +func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.index_cast %0 : i32 to index + %3 = arith.index_cast %1 : i32 to index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%2, %3} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%2, %3} + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %9 = tensor.empty(%2, %3) : tensor + %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor + %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %12 = arith.addf %in, %in_0 : f32 + linalg.yield %12 : f32 + } -> tensor + flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} + return } // Masking is applied to the main vector loop when peeling is not used. 
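// The dynamic shapes in these masked tests arrive as push constants; note the
// updated syntax exercised above, where each constant is addressed by ordinal
// against an explicit pipeline layout (sketch repeated from the test body):
//   %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
//   %2 = arith.index_cast %0 : i32 to index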
@@ -104,30 +120,36 @@ module { // CHECK-NOT: scf.for // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.index_cast %0 : i32 to index - %3 = arith.index_cast %1 : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %9 = tensor.empty(%2, %3) : tensor - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor - %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %12 = arith.addf %in, %in_0 : f32 - linalg.yield %12 : f32 - } -> tensor - flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} - return - } +func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.index_cast %0 : i32 to index + %3 = arith.index_cast %1 : i32 to index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%2, %3} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%2, %3} + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %9 = tensor.empty(%2, %3) : tensor + %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor + %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %12 = arith.addf %in, %in_0 : f32 + linalg.yield %12 : f32 + } -> tensor + flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} + return } // Masking should not happen on aarch64 if there is no SVE support. 
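// Without SVE there are no scalable vectors, so the expectation is plain
// fixed-width transfers plus a peeled remainder loop rather than masked ops; a
// hypothetical sketch of the main-loop access that the CHECK-NOT below rules
// out masking for:
//   %v = vector.transfer_read %in[%i, %j], %pad {in_bounds = [true, true]}
//          : tensor<?x?xf32>, vector<4x4xf32>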
@@ -136,25 +158,32 @@ module { // CHECK-NOT: vector.maskedload // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @mask_matmul_sve() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor - %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor - %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor - flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} - return - } +func.func @mask_matmul_sve() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2} + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1} + %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor + %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor + %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor + flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} + return } // Masking is applied to the matmul on aarch64 when SVE is enabled. 
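// With +sve the lowering strategy can pick scalable vector shapes, which is
// what makes masked loads profitable for the dynamic matmul; a hypothetical
// 1-D sketch of the masked access the CHECK below matches:
//   %v = vector.maskedload %buf[%i], %mask, %pad
//          : memref<?xf32>, vector<[8]xi1>, vector<[8]xf32> into vector<[8]xf32>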
@@ -163,30 +192,36 @@ module { // CHECK: vector.maskedload // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.index_cast %0 : i32 to index - %3 = arith.index_cast %1 : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%2, %3} - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor - %9 = tensor.empty(%2, %3) : tensor - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor - %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %12 = arith.addf %in, %in_0 : f32 - linalg.yield %12 : f32 - } -> tensor - flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} - return - } +func.func @mask_dynamic_generic_add() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.index_cast %0 : i32 to index + %3 = arith.index_cast %1 : i32 to index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%2, %3} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %3} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%2, %3} + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor + %9 = tensor.empty(%2, %3) : tensor + %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor + %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %8 : tensor, tensor) outs(%10 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %12 = arith.addf %in, %in_0 : f32 + linalg.yield %12 : f32 + } -> tensor + flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%2, %3} + return } // Masking is applied to the peeled 
loop on aarch64 when SVE is enabled. diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir index e25e6c075798e..1e2e60deb2cf8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_vectorize_nd_extract_tests.mlir @@ -1,88 +1,92 @@ // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_system_elf_riscv_64_ = #hal.executable.target<"llvm-cpu", "system-elf-riscv_64", {cpu = "generic-rv64", cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128", native_vector_size = 64 : index, target_triple = "riscv64"}> #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1) -> (d0 + d1 * 257)> -module { - func.func @main_dispatch_77_generic_1x257x257x21() attributes {hal.executable.target = #executable_target_system_elf_riscv_64_} { - %c1115136 = arith.constant 1115136 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 2.000000e+00 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant 1.600000e+01 : f32 - %c1_i32 = arith.constant 1 : i32 - %c32_i32 = arith.constant 32 : i32 - %cst_2 = arith.constant 1.000000e+00 : f32 - %c0_i32 = arith.constant 0 : i32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1115136) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 21], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x33x33x21xf32> - %3 = tensor.empty() : tensor<1x257x257x21xf32> - %4 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3 : tensor<1x257x257x21xf32>) { - ^bb0(%out: f32): - %5 = linalg.index 1 : index - %6 = linalg.index 0 : index - %7 = affine.apply #map1(%5, %6) - %8 = linalg.index 2 : index - %9 = linalg.index 3 : index - %10 = arith.index_cast %7 : index to i32 - %11 = arith.index_cast %8 : index to i32 - %12 = arith.uitofp %10 : i32 to f32 - %13 = arith.mulf %12, %cst : f32 - %14 = arith.addf %13, %cst_0 : f32 - %15 = arith.divf %14, %cst_1 : f32 - %16 = math.floor %15 : f32 - %17 = arith.subf %15, %16 : f32 - %18 = arith.fptosi %16 : f32 to i32 - %19 = arith.uitofp %11 : i32 to f32 - %20 = arith.mulf %19, %cst : f32 - %21 = arith.addf %20, %cst_0 : f32 - %22 = arith.divf %21, %cst_1 : f32 - %23 = math.floor %22 : f32 - %24 = arith.subf %22, %23 : f32 - %25 = arith.fptosi %23 : f32 to i32 - %26 = arith.addi %18, %c1_i32 : i32 - %27 = arith.cmpi slt, %18, %c0_i32 : i32 - %28 = arith.select %27, %c0_i32, %18 : i32 - %29 = arith.cmpi sgt, %18, %c32_i32 : i32 - %30 = arith.select %29, %c32_i32, %28 : i32 - %31 = arith.cmpi slt, %26, %c0_i32 : i32 - %32 = arith.select %31, %c0_i32, %26 : i32 - %33 = arith.cmpi sgt, %26, %c32_i32 : i32 - %34 = arith.select %33, %c32_i32, %32 : i32 - %35 = arith.index_cast %30 : i32 to index - %36 = arith.index_cast %34 : i32 to index - %37 = arith.addi %25, %c1_i32 : i32 - 
%38 = arith.cmpi slt, %25, %c0_i32 : i32 - %39 = arith.select %38, %c0_i32, %25 : i32 - %40 = arith.cmpi sgt, %25, %c32_i32 : i32 - %41 = arith.select %40, %c32_i32, %39 : i32 - %42 = arith.cmpi slt, %37, %c0_i32 : i32 - %43 = arith.select %42, %c0_i32, %37 : i32 - %44 = arith.cmpi sgt, %37, %c32_i32 : i32 - %45 = arith.select %44, %c32_i32, %43 : i32 - %46 = arith.index_cast %41 : i32 to index - %47 = arith.index_cast %45 : i32 to index - %extracted = tensor.extract %2[%c0, %35, %46, %9] : tensor<1x33x33x21xf32> - %extracted_3 = tensor.extract %2[%c0, %35, %47, %9] : tensor<1x33x33x21xf32> - %extracted_4 = tensor.extract %2[%c0, %36, %46, %9] : tensor<1x33x33x21xf32> - %extracted_5 = tensor.extract %2[%c0, %36, %47, %9] : tensor<1x33x33x21xf32> - %48 = arith.subf %cst_2, %24 : f32 - %49 = arith.mulf %extracted, %48 : f32 - %50 = arith.mulf %extracted_3, %24 : f32 - %51 = arith.addf %49, %50 : f32 - %52 = arith.mulf %extracted_4, %48 : f32 - %53 = arith.mulf %extracted_5, %24 : f32 - %54 = arith.addf %52, %53 : f32 - %55 = arith.subf %cst_2, %17 : f32 - %56 = arith.mulf %51, %55 : f32 - %57 = arith.mulf %54, %17 : f32 - %58 = arith.addf %56, %57 : f32 - linalg.yield %58 : f32 - } -> tensor<1x257x257x21xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [1, 257, 257, 21], strides = [1, 1, 1, 1] : tensor<1x257x257x21xf32> -> !flow.dispatch.tensor> - return - } +func.func @main_dispatch_77_generic_1x257x257x21() attributes {hal.executable.target = #executable_target_system_elf_riscv_64_} { + %c1115136 = arith.constant 1115136 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 2.000000e+00 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.600000e+01 : f32 + %c1_i32 = arith.constant 1 : i32 + %c32_i32 = arith.constant 32 : i32 + %cst_2 = arith.constant 1.000000e+00 : f32 + %c0_i32 = arith.constant 0 : i32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c1115136) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 21], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x33x33x21xf32> + %3 = tensor.empty() : tensor<1x257x257x21xf32> + %4 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%3 : tensor<1x257x257x21xf32>) { + ^bb0(%out: f32): + %5 = linalg.index 1 : index + %6 = linalg.index 0 : index + %7 = affine.apply #map1(%5, %6) + %8 = linalg.index 2 : index + %9 = linalg.index 3 : index + %10 = arith.index_cast %7 : index to i32 + %11 = arith.index_cast %8 : index to i32 + %12 = arith.uitofp %10 : i32 to f32 + %13 = arith.mulf %12, %cst : f32 + %14 = arith.addf %13, %cst_0 : f32 + %15 = arith.divf %14, %cst_1 : f32 + %16 = math.floor %15 : f32 + %17 = arith.subf %15, %16 : f32 + %18 = arith.fptosi %16 : f32 to i32 + %19 = arith.uitofp %11 : i32 to f32 + %20 = arith.mulf %19, %cst : f32 + %21 = arith.addf %20, %cst_0 : f32 + %22 = arith.divf %21, %cst_1 : f32 + %23 = math.floor %22 : f32 + %24 = arith.subf %22, %23 : f32 + %25 = arith.fptosi %23 : f32 to i32 + %26 = arith.addi %18, %c1_i32 : i32 + %27 = arith.cmpi slt, %18, %c0_i32 : i32 + %28 = arith.select %27, %c0_i32, %18 : i32 + %29 = arith.cmpi sgt, %18, %c32_i32 : i32 + %30 = arith.select %29, %c32_i32, %28 : i32 + %31 = arith.cmpi slt, %26, %c0_i32 : 
i32 + %32 = arith.select %31, %c0_i32, %26 : i32 + %33 = arith.cmpi sgt, %26, %c32_i32 : i32 + %34 = arith.select %33, %c32_i32, %32 : i32 + %35 = arith.index_cast %30 : i32 to index + %36 = arith.index_cast %34 : i32 to index + %37 = arith.addi %25, %c1_i32 : i32 + %38 = arith.cmpi slt, %25, %c0_i32 : i32 + %39 = arith.select %38, %c0_i32, %25 : i32 + %40 = arith.cmpi sgt, %25, %c32_i32 : i32 + %41 = arith.select %40, %c32_i32, %39 : i32 + %42 = arith.cmpi slt, %37, %c0_i32 : i32 + %43 = arith.select %42, %c0_i32, %37 : i32 + %44 = arith.cmpi sgt, %37, %c32_i32 : i32 + %45 = arith.select %44, %c32_i32, %43 : i32 + %46 = arith.index_cast %41 : i32 to index + %47 = arith.index_cast %45 : i32 to index + %extracted = tensor.extract %2[%c0, %35, %46, %9] : tensor<1x33x33x21xf32> + %extracted_3 = tensor.extract %2[%c0, %35, %47, %9] : tensor<1x33x33x21xf32> + %extracted_4 = tensor.extract %2[%c0, %36, %46, %9] : tensor<1x33x33x21xf32> + %extracted_5 = tensor.extract %2[%c0, %36, %47, %9] : tensor<1x33x33x21xf32> + %48 = arith.subf %cst_2, %24 : f32 + %49 = arith.mulf %extracted, %48 : f32 + %50 = arith.mulf %extracted_3, %24 : f32 + %51 = arith.addf %49, %50 : f32 + %52 = arith.mulf %extracted_4, %48 : f32 + %53 = arith.mulf %extracted_5, %24 : f32 + %54 = arith.addf %52, %53 : f32 + %55 = arith.subf %cst_2, %17 : f32 + %56 = arith.mulf %51, %55 : f32 + %57 = arith.mulf %54, %17 : f32 + %58 = arith.addf %56, %57 : f32 + linalg.yield %58 : f32 + } -> tensor<1x257x257x21xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [1, 257, 257, 21], strides = [1, 1, 1, 1] : tensor<1x257x257x21xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @main_dispatch_77_generic_1x257x257x21 diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir index 841615bd289c0..410d80149ecb7 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_lowering_strategy.mlir @@ -1,24 +1,30 @@ // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @matmul_tensors_default() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %7 = flow.dispatch.tensor.load %3, 
offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor - %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor - %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor - flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} - return - } +func.func @matmul_tensors_default() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2} + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1} + %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor + %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor + %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor + flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -29,25 +35,31 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @i4_i4_i32_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 
1] : !flow.dispatch.tensor>{%0, %2} -> tensor - %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor - %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor - flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} - return - } +func.func @i4_i4_i32_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2} + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1} + %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor + %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor + %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor + flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -59,25 +71,30 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @batch_matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor>{%0, %1, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor>{%0, %3, %2} - %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2} - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [%0, %1, %3], strides = [1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %3} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [%0, %3, %2], strides = [1, 1, 1] : !flow.dispatch.tensor>{%0, %3, %2} -> 
tensor - %9 = tensor.empty(%0, %1, %2) : tensor - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor - %11 = linalg.batch_matmul ins(%7, %8 : tensor, tensor) outs(%10 : tensor) -> tensor - flow.dispatch.tensor.store %11, %6, offsets = [0, 0, 0], sizes = [%0, %1, %2], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1, %2} - return - } +func.func @batch_matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor>{%0, %1, %3} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) : !flow.dispatch.tensor>{%0, %3, %2} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) : !flow.dispatch.tensor>{%0, %1, %2} + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [%0, %1, %3], strides = [1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %3} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [%0, %3, %2], strides = [1, 1, 1] : !flow.dispatch.tensor>{%0, %3, %2} -> tensor + %9 = tensor.empty(%0, %1, %2) : tensor + %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor) -> tensor + %11 = linalg.batch_matmul ins(%7, %8 : tensor, tensor) outs(%10 : tensor) -> tensor + flow.dispatch.tensor.store %11, %6, offsets = [0, 0, 0], sizes = [%0, %1, %2], strides = [1, 1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1, %2} + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -88,21 +105,26 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> -module { - func.func @matmul_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 240], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<196x240xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [240, 40], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<240x40xf32> - %5 = tensor.empty() : tensor<196x40xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x40xf32>) -> tensor<196x40xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<196x240xf32>, tensor<240x40xf32>) outs(%6 : tensor<196x40xf32>) -> tensor<196x40xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 40], strides = [1, 1] : tensor<196x40xf32> -> 
!flow.dispatch.tensor> - return - } +func.func @matmul_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 240], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<196x240xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [240, 40], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<240x40xf32> + %5 = tensor.empty() : tensor<196x40xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x40xf32>) -> tensor<196x40xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<196x240xf32>, tensor<240x40xf32>) outs(%6 : tensor<196x40xf32>) -> tensor<196x40xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 40], strides = [1, 1] : tensor<196x40xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -113,23 +135,28 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> -module { - func.func @conv_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c607520 = arith.constant 607520 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c607520) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 51, 41, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x51x41x512xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 512, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x512x512xf32> - %5 = tensor.empty() : tensor<1x25x20x512xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x51x41x512xf32>, tensor<3x3x512x512xf32>) outs(%6 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 25, 20, 512], strides = [1, 1, 1, 1] : tensor<1x25x20x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @conv_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %c607520 = arith.constant 607520 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) 
alignment(32) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c607520) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 51, 41, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x51x41x512xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 512, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x512x512xf32> + %5 = tensor.empty() : tensor<1x25x20x512xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32> + %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x51x41x512xf32>, tensor<3x3x512x512xf32>) outs(%6 : tensor<1x25x20x512xf32>) -> tensor<1x25x20x512xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 25, 20, 512], strides = [1, 1, 1, 1] : tensor<1x25x20x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -139,21 +166,26 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> -module { - func.func @restrict_num_workgroups() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 11, 11, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x11x11x576xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<5x5x576xf32> - %5 = tensor.empty() : tensor<1x7x7x576xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32> - %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>) outs(%6 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 576], strides = [1, 1, 1, 1] : tensor<1x7x7x576xf32> -> !flow.dispatch.tensor> - return - } +func.func @restrict_num_workgroups() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> 
+ %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 11, 11, 576], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x11x11x576xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [5, 5, 576], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<5x5x576xf32> + %5 = tensor.empty() : tensor<1x7x7x576xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32> + %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<1x11x11x576xf32>, tensor<5x5x576xf32>) outs(%6 : tensor<1x7x7x576xf32>) -> tensor<1x7x7x576xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 576], strides = [1, 1, 1, 1] : tensor<1x7x7x576xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -162,25 +194,29 @@ module { // CHECK: linalg.depthwise_conv_2d_nhwc_hwc // CHECK-SAME: lowering_config = #[[CONFIG]] - // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> -module { - func.func @matmul_aarch_i8_i8_i32_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x384xi8> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<384x1536xi8> - %5 = tensor.empty() : tensor<128x1536xi32> - %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32> - %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor> - return - } +func.func @matmul_aarch_i8_i8_i32_static() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x384xi8> + %4 = flow.dispatch.tensor.load %1, 
offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<384x1536xi8> + %5 = tensor.empty() : tensor<128x1536xi32> + %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32> + %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -191,23 +227,28 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> -module { - func.func @matmul_aarch_i8_i8_i32_dynamic() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%0, %2} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%2, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%0, %1} - %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor - %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor - %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor - %9 = linalg.matmul ins(%6, %7 : tensor, tensor) outs(%8 : tensor) -> tensor - flow.dispatch.tensor.store %9, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} - return - } +func.func @matmul_aarch_i8_i8_i32_dynamic() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%0, %2} + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%2, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor>{%0, %1} + %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor + %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor + %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], 
sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %9 = linalg.matmul ins(%6, %7 : tensor, tensor) outs(%8 : tensor) -> tensor + flow.dispatch.tensor.store %9, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -218,19 +259,23 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> -module { - func.func @pack() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<20x40xf32> - %3 = tensor.empty() : tensor<4x48x8x1xf32> - %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 : tensor<20x40xf32> -> tensor<4x48x8x1xf32> - flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [4, 48, 8, 1], strides = [1, 1, 1, 1] : tensor<4x48x8x1xf32> -> !flow.dispatch.tensor> - return - } +func.func @pack() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<20x40xf32> + %3 = tensor.empty() : tensor<4x48x8x1xf32> + %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 : tensor<20x40xf32> -> tensor<4x48x8x1xf32> + flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [4, 48, 8, 1], strides = [1, 1, 1, 1] : tensor<4x48x8x1xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -241,27 +286,31 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}> -module { - func.func @unpack_outer_dynamic() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { - %c131072 = arith.constant 131072 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = 
hal.interface.constant.load[3] : i32 - %4 = arith.index_castui %0 : i32 to index - %5 = arith.index_castui %1 : i32 to index - %6 = arith.index_castui %2 : i32 to index - %7 = arith.index_castui %3 : i32 to index - %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%4, %5} - %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor>{%6, %7} - %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %5} -> tensor - %11 = tensor.empty(%6, %7) : tensor - %unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : tensor -> tensor - flow.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%6, %7} - return - } +func.func @unpack_outer_dynamic() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} { + %c131072 = arith.constant 131072 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = arith.index_castui %0 : i32 to index + %5 = arith.index_castui %1 : i32 to index + %6 = arith.index_castui %2 : i32 to index + %7 = arith.index_castui %3 : i32 to index + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%4, %5} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c131072) : !flow.dispatch.tensor>{%6, %7} + %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%4, %5} -> tensor + %11 = tensor.empty(%6, %7) : tensor + %unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : tensor -> tensor + flow.dispatch.tensor.store %unpack, %9, offsets = [0, 0], sizes = [%6, %7], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%6, %7} + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -272,22 +321,27 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @mmt4d_384x384x512_4x1x4_dispatch_0() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %c0 = arith.constant 0 : index - %c96 = arith.constant 96 : index - %c128 = arith.constant 128 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 1], strides = [1, 1, 1, 1] : 
!flow.dispatch.tensor> -> tensor<96x384x4x1xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x384x4x1xf32> - %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<96x128x4x4xf32> - %6 = linalg.mmt4d ins(%3, %4 : tensor<96x384x4x1xf32>, tensor<128x384x4x1xf32>) outs(%5 : tensor<96x128x4x4xf32>) -> tensor<96x128x4x4xf32> - flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [96, 128, 4, 4], strides = [1, 1, 1, 1] : tensor<96x128x4x4xf32> -> !flow.dispatch.tensor> - return - } +func.func @mmt4d_384x384x512_4x1x4_dispatch_0() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %c0 = arith.constant 0 : index + %c96 = arith.constant 96 : index + %c128 = arith.constant 128 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<96x384x4x1xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 384, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x384x4x1xf32> + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [96, 384, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<96x128x4x4xf32> + %6 = linalg.mmt4d ins(%3, %4 : tensor<96x384x4x1xf32>, tensor<128x384x4x1xf32>) outs(%5 : tensor<96x128x4x4xf32>) -> tensor<96x128x4x4xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [96, 128, 4, 4], strides = [1, 1, 1, 1] : tensor<96x128x4x4xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> func.func @transpose_f32() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> %3 = tensor.empty() : tensor<32x32xf32> %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<32x32xf32>) outs(%3 : tensor<32x32xf32>) { @@ -24,11 +30,17 @@ func.func @transpose_f32() 
attributes {hal.executable.target = #executable_targe // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> func.func @transpose_f64() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf64> %3 = tensor.empty() : tensor<32x32xf64> %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<32x32xf64>) outs(%3 : tensor<32x32xf64>) { @@ -48,11 +60,17 @@ func.func @transpose_f64() attributes {hal.executable.target = #executable_targe // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> func.func @transpose_unsupported_not_rank_2() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 4, 8], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8xf32> %3 = tensor.empty() : tensor<2x8x4xf32> %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<2x4x8xf32>) outs(%3 : tensor<2x8x4xf32>) { @@ -72,11 +90,17 @@ func.func @transpose_unsupported_not_rank_2() attributes {hal.executable.target // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> func.func @transpose_unsupported_not_simple_transpose() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> %3 = tensor.empty() : tensor<32x32xf32> %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<32x32xf32>) outs(%3 : tensor<32x32xf32>) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir index 4944337da30c1..139f1967e043c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir @@ -3,25 +3,31 @@ // RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' \ // RUN: --iree-llvmcpu-enable-scalable-vectorization=true --split-input-file --iree-llvmcpu-disable-arm-sme-tiling %s | FileCheck %s --check-prefixes=CHECK,DISABLE-ARM-SME +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %2} - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%2, %1} - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1} - %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor - %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor - %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : 
!flow.dispatch.tensor>{%0, %1} -> tensor - %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor - flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} - return - } +func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %2} + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%2, %1} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor>{%0, %1} + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor>{%0, %1} + %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor>{%0, %2} -> tensor + %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor>{%2, %1} -> tensor + %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor + %10 = linalg.matmul ins(%7, %8 : tensor, tensor) outs(%9 : tensor) -> tensor + flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -32,20 +38,26 @@ module { // CHECK-SAME: lowering_config = #[[CONFIG]] // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @static_tensors_non_pow_two_sizes() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<15x14xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<14x7xf32> - %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<15x7xf32> - %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32> - flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> 
!flow.dispatch.tensor> - return - } +func.func @static_tensors_non_pow_two_sizes() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<15x14xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<14x7xf32> + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<15x7xf32> + %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -56,20 +68,26 @@ module { // CHECK-SAME: lowering_config = #[[CONFIG]] // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> -module { - func.func @static_tensors_1x1() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1xf32> - %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1xf32> - %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32> - flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor> - return - } +func.func @static_tensors_1x1() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = 
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+  %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -80,25 +98,32 @@ module {
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve,+sme", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
-module {
-  func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
-    %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
-    %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
-    %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
-    %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
-    %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-    %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
-    flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    return
-  }
+func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+  %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+  %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+  %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  return
}
// DISABLE-ARM-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -116,46 +141,52 @@ module {
// WITH-SME-SAME: lowering_config = #[[CONFIG]]
// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 5, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {cpu = "", cpu_features = "+v9a,+sve", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", link_embedded = false, native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android34"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
#map2 = affine_map<(d0, d1) -> (d1)>
-module {
-  func.func @matmul_with_fill() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
-    %c0_i32 = arith.constant 0 : i32
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = hal.interface.constant.load[2] : i32
-    %3 = hal.interface.constant.load[3] : i32
-    %4 = hal.interface.constant.load[4] : i32
-    %5 = arith.index_castui %0 : i32 to index
-    %6 = arith.index_castui %1 : i32 to index
-    %7 = arith.index_castui %2 : i32 to index
-    %8 = arith.index_castui %3 : i32 to index
-    %9 = arith.index_castui %4 : i32 to index
-    %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>>
-    %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
-    %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
-    %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
-    %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
-    %15 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>> -> tensor<1024x256xi8>
-    %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
-    %17 = flow.dispatch.tensor.load %12, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
-    %18 = flow.dispatch.tensor.load %13, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
-    %19 = tensor.empty() : tensor<1024x256xf32>
-    %20 = tensor.empty() : tensor<1024x256xi32>
-    %21 = linalg.fill ins(%c0_i32 : i32) outs(%20 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
-    %22 = linalg.matmul ins(%15, %16 : tensor<1024x256xi8>, tensor<256x256xi8>) outs(%21 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
-    %23 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map], iterator_types = ["parallel", "parallel"]} ins(%22, %17, %18 : tensor<1024x256xi32>, tensor<1024xf32>, tensor<256xf32>) outs(%19 : tensor<1024x256xf32>) {
-    ^bb0(%in: i32, %in_0: f32, %in_1: f32, %out: f32):
-      %24 = arith.sitofp %in : i32 to f32
-      %25 = arith.mulf %24, %in_0 : f32
-      %26 = arith.mulf %25, %in_1 : f32
-      linalg.yield %26 : f32
-    } -> tensor<1024x256xf32>
-    flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
-    return
-  }
+func.func @matmul_with_fill() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+  %c0_i32 = arith.constant 0 : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+  %5 = arith.index_castui %0 : i32 to index
+  %6 = arith.index_castui %1 : i32 to index
+  %7 = arith.index_castui %2 : i32 to index
+  %8 = arith.index_castui %3 : i32 to index
+  %9 = arith.index_castui %4 : i32 to index
+  %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>>
+  %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
+  %12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
+  %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256xf32>>
+  %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%9) : !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
+  %15 = flow.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x256xi8>> -> tensor<1024x256xi8>
+  %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
+  %17 = flow.dispatch.tensor.load %12, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
+  %18 = flow.dispatch.tensor.load %13, offsets = [0], sizes = [256], strides = [1] : !flow.dispatch.tensor<readonly:tensor<256xf32>> -> tensor<256xf32>
+  %19 = tensor.empty() : tensor<1024x256xf32>
+  %20 = tensor.empty() : tensor<1024x256xi32>
+  %21 = linalg.fill ins(%c0_i32 : i32) outs(%20 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
+  %22 = linalg.matmul ins(%15, %16 : tensor<1024x256xi8>, tensor<256x256xi8>) outs(%21 : tensor<1024x256xi32>) -> tensor<1024x256xi32>
+  %23 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map], iterator_types = ["parallel", "parallel"]} ins(%22, %17, %18 : tensor<1024x256xi32>, tensor<1024xf32>, tensor<256xf32>) outs(%19 : tensor<1024x256xf32>) {
+  ^bb0(%in: i32, %in_0: f32, %in_1: f32, %out: f32):
+    %24 = arith.sitofp %in : i32 to f32
+    %25 = arith.mulf %24, %in_0 : f32
+    %26 = arith.mulf %25, %in_1 : f32
+    linalg.yield %26 : f32
+  } -> tensor<1024x256xf32>
+  flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : tensor<1024x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x256xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
@@ -170,21 +201,26 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {cpu = "", cpu_features = "+v9a,+sve", data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", link_embedded = false, native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android34"}>
-module {
-  func.func @depthwise_conv() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
-    %5 = tensor.empty() : tensor<1x28x28x72xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
-    return
-  }
+func.func @depthwise_conv() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
+  %5 = tensor.empty() : tensor<1x28x28x72xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir
index 3aca18d304d15..02ad9c9ab076a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy_peeling.mlir
@@ -2,25 +2,31 @@
// RUN:   --iree-llvmcpu-enable-scalable-vectorization=true --iree-llvmcpu-vector-pproc-strategy=peel \
// RUN:   --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
-module {
-  func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
-    %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
-    %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
-    %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
-    %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
-    %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-    %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
-    flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    return
-  }
+func.func @matmul_tensors() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  %7 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+  %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+  %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+  %10 = linalg.matmul ins(%7, %8 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%9 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  flow.dispatch.tensor.store %10, %6, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -31,20 +37,26 @@ module {
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
-module {
-  func.func @static_tensors_non_pow_two_sizes() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x7xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x7xf32>> -> tensor<14x7xf32>
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> -> tensor<15x7xf32>
-    %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32>
-    flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
-    return
-  }
+func.func @static_tensors_non_pow_two_sizes() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<15x14xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<14x7xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [15, 14], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<15x14xf32>> -> tensor<15x14xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [14, 7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<14x7xf32>> -> tensor<14x7xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<15x7xf32>> -> tensor<15x7xf32>
+  %6 = linalg.matmul ins(%3, %4 : tensor<15x14xf32>, tensor<14x7xf32>) outs(%5 : tensor<15x7xf32>) -> tensor<15x7xf32>
+  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [15, 7], strides = [1, 1] : tensor<15x7xf32> -> !flow.dispatch.tensor<readwrite:tensor<15x7xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -55,20 +67,26 @@ module {
// CHECK-SAME: lowering_config = #[[CONFIG]]
// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
-module {
-  func.func @static_tensors_1x1() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
-    %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
-    flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
-    return
-  }
+func.func @static_tensors_1x1() attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1xf32>> -> tensor<1x1xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+  %6 = linalg.matmul ins(%3, %4 : tensor<1x1xf32>, tensor<1x1xf32>) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
+  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+  return
}
// TODO: FIXME - scalable "16" ([16]) for just 1 element
@@ -81,22 +99,27 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
-module {
-  func.func @depthwise_conv() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x1x4x4xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x4x4xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x1x4xf32>>
-    %input = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 1, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x4x4xf32>> -> tensor<1x1x4x4xf32>
-    %filter = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 4, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x4xf32>> -> tensor<1x4x4xf32>
-    %5 = tensor.empty() : tensor<1x1x1x4xf32>
-    %output = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1x1x4xf32>) -> tensor<1x1x1x4xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>,
-      strides = dense<1> : tensor<2xi64>} ins(%input, %filter : tensor<1x1x4x4xf32>, tensor<1x4x4xf32>) outs(%output : tensor<1x1x1x4xf32>) -> tensor<1x1x1x4xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1] : tensor<1x1x1x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x1x4xf32>>
-    return
-  }
+func.func @depthwise_conv() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x1x4x4xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<1x4x4xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x1x1x4xf32>>
+  %input = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 1, 4, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x4x4xf32>> -> tensor<1x1x4x4xf32>
+  %filter = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 4, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x4xf32>> -> tensor<1x4x4xf32>
+  %5 = tensor.empty() : tensor<1x1x1x4xf32>
+  %output = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1x1x4xf32>) -> tensor<1x1x1x4xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>,
+    strides = dense<1> : tensor<2xi64>} ins(%input, %filter : tensor<1x1x4x4xf32>, tensor<1x4x4xf32>) outs(%output : tensor<1x1x1x4xf32>) -> tensor<1x1x1x4xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 4], strides = [1, 1, 1, 1] : tensor<1x1x1x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x1x4xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir
index f978d3332f500..fe4c802cbb3a0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_lowering_strategy_without_distribution.mlir
@@ -1,23 +1,27 @@
// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --iree-llvmcpu-disable-distribution --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @matmul_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
-    %5 = tensor.empty() : tensor<384x128xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
-    return
-  }
+func.func @matmul_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+  %5 = tensor.empty() : tensor<384x128xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+  return
}
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir
index 0b1eceeb508bd..07769efdd97fa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_riscv_lowering_strategy.mlir
@@ -1,20 +1,25 @@
// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_riscv_32_ = #hal.executable.target<"llvm-cpu", "embedded-elf-riscv_32", {cpu_features = "+m,+f", data_layout = "e-m:e-p:32:32-i64:64-n32-S128", native_vector_size = 16 : index, target_triple = "riscv32-none-elf"}>
-module {
-  func.func @matmul_riscv() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
-    %5 = tensor.empty() : tensor<384x128xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
-    return
-  }
+func.func @matmul_riscv() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+  %5 = tensor.empty() : tensor<384x128xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -26,21 +31,27 @@ module {
// CHECK-SAME: lowering_config = #[[CONFIG2]]
// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_riscv_32_ = #hal.executable.target<"llvm-cpu", "embedded-elf-riscv_32", {cpu_features = "+m,+f", data_layout = "e-m:e-p:32:32-i64:64-n32-S128", native_vector_size = 16 : index, target_triple = "riscv32-none-elf"}>
-module {
-  func.func @thin_depthwise_conv_static() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
-    %5 = tensor.empty() : tensor<1x28x28x72xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
-    return
-  }
+func.func @thin_depthwise_conv_static() attributes {hal.executable.target = #executable_target_embedded_elf_riscv_32_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
+  %5 = tensor.empty() : tensor<1x28x28x72xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index ed06e0bfdb3ce..15183ef2a2f25 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1,21 +1,26 @@
// RUN: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy)' --split-input-file %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @matvec_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
-    %5 = tensor.empty() : tensor<128xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128xf32>) -> tensor<128xf32>
-    %7 = linalg.matvec ins(%3, %4 : tensor<128x384xf32>, tensor<384xf32>) outs(%6 : tensor<128xf32>) -> tensor<128xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
-    return
-  }
+func.func @matvec_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+  %5 = tensor.empty() : tensor<128xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128xf32>) -> tensor<128xf32>
+  %7 = linalg.matvec ins(%3, %4 : tensor<128x384xf32>, tensor<384xf32>) outs(%6 : tensor<128xf32>) -> tensor<128xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor<writeonly:tensor<128xf32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -27,30 +32,35 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @matvec_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = hal.interface.constant.load[2] : i32
-    %3 = arith.index_cast %0 : i32 to index
-    %4 = arith.index_cast %1 : i32 to index
-    %5 = arith.index_cast %2 : i32 to index
-    %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4}
-    %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5}
-    %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%3}
-    %9 = hal.interface.constant.load[0] : i32
-    %10 = arith.index_cast %9 : i32 to index
-    %11 = flow.dispatch.tensor.load %8, offsets = [0], sizes = [%10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%3} -> tensor<?xf32>
-    %12 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4} -> tensor<?x?xf32>
-    %13 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5} -> tensor<?xf32>
-    %14 = linalg.fill ins(%cst : f32) outs(%11 : tensor<?xf32>) -> tensor<?xf32>
-    %15 = linalg.matvec ins(%12, %13 : tensor<?x?xf32>, tensor<?xf32>) outs(%14 : tensor<?xf32>) -> tensor<?xf32>
-    flow.dispatch.tensor.store %15, %8, offsets = [0], sizes = [%3], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%3}
-    return
-  }
+func.func @matvec_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %3 = arith.index_cast %0 : i32 to index
+  %4 = arith.index_cast %1 : i32 to index
+  %5 = arith.index_cast %2 : i32 to index
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5}
+  %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%3}
+  %9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %10 = arith.index_cast %9 : i32 to index
+  %11 = flow.dispatch.tensor.load %8, offsets = [0], sizes = [%10], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%3} -> tensor<?xf32>
+  %12 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [%3, %4], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%3, %4} -> tensor<?x?xf32>
+  %13 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%5} -> tensor<?xf32>
+  %14 = linalg.fill ins(%cst : f32) outs(%11 : tensor<?xf32>) -> tensor<?xf32>
+  %15 = linalg.matvec ins(%12, %13 : tensor<?x?xf32>, tensor<?xf32>) outs(%14 : tensor<?xf32>) -> tensor<?xf32>
+  flow.dispatch.tensor.store %15, %8, offsets = [0], sizes = [%3], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?xf32>>{%3}
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -62,22 +72,27 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @dot_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
-    %5 = tensor.empty() : tensor<f32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
-    %7 = linalg.dot ins(%3, %4 : tensor<384xf32>, tensor<384xf32>) outs(%6 : tensor<f32>) -> tensor<f32>
-    flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
-    return
-  }
+func.func @dot_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+  %5 = tensor.empty() : tensor<f32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
+  %7 = linalg.dot ins(%3, %4 : tensor<384xf32>, tensor<384xf32>) outs(%6 : tensor<f32>) -> tensor<f32>
+  flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -89,26 +104,31 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @dot_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = arith.index_cast %0 : i32 to index
-    %3 = arith.index_cast %1 : i32 to index
-    %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<f32>>
-    %5 = flow.dispatch.tensor.load %4, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
-    %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2}
-    %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3}
-    %8 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2} -> tensor<?xf32>
-    %9 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3} -> tensor<?xf32>
-    %10 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
-    %11 = linalg.dot ins(%8, %9 : tensor<?xf32>, tensor<?xf32>) outs(%10 : tensor<f32>) -> tensor<f32>
-    flow.dispatch.tensor.store %11, %4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
-    return
-  }
+func.func @dot_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = arith.index_cast %0 : i32 to index
+  %3 = arith.index_cast %1 : i32 to index
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<f32>>
+  %5 = flow.dispatch.tensor.load %4, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3}
+  %8 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [%2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%2} -> tensor<?xf32>
+  %9 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [%3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%3} -> tensor<?xf32>
+  %10 = linalg.fill ins(%cst : f32) outs(%5 : tensor<f32>) -> tensor<f32>
+  %11 = linalg.dot ins(%8, %9 : tensor<?xf32>, tensor<?xf32>) outs(%10 : tensor<f32>) -> tensor<f32>
+  flow.dispatch.tensor.store %11, %4, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -120,28 +140,33 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1)>
-module {
-  func.func @dynamic_add() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1}
-    %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-    %6 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [%1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1} -> tensor<?xf32>
-    %7 = tensor.empty(%0, %1) : tensor<?x?xf32>
-    %8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %6 : tensor<?x?xf32>, tensor<?xf32>) outs(%7 : tensor<?x?xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %9 = arith.addf %in, %in_0 : f32
-      linalg.yield %9 : f32
-    } -> tensor<?x?xf32>
-    flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    return
-  }
+func.func @dynamic_add() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+  %6 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [%1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%1} -> tensor<?xf32>
+  %7 = tensor.empty(%0, %1) : tensor<?x?xf32>
+  %8 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %6 : tensor<?x?xf32>, tensor<?xf32>) outs(%7 : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %9 = arith.addf %in, %in_0 : f32
+    linalg.yield %9 : f32
+  } -> tensor<?x?xf32>
+  flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -152,28 +177,33 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @add4D() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.constant.load[3] : index
-    %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
-    %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
-    %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
-    %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
-    %8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
-    %9 = tensor.empty(%0, %1, %2, %3) : tensor<?x?x?x?xf32>
-    %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %8 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%9 : tensor<?x?x?x?xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %11 = arith.addf %in, %in_0 : f32
-      linalg.yield %11 : f32
-    } -> tensor<?x?x?x?xf32>
-    flow.dispatch.tensor.store %10, %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
-    return
-  }
+func.func @add4D() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+  %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
+  %8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
+  %9 = tensor.empty(%0, %1, %2, %3) : tensor<?x?x?x?xf32>
+  %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %8 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%9 : tensor<?x?x?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %11 = arith.addf %in, %in_0 : f32
+    linalg.yield %11 : f32
+  } -> tensor<?x?x?x?xf32>
+  flow.dispatch.tensor.store %10, %6, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+  return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -185,23 +215,27 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @add_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x16x32x128xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>> -> tensor<64x16x32x128xf32>
-    %3 = tensor.empty() : tensor<64x16x32x128xf32>
-    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<64x16x32x128xf32>) outs(%3 : tensor<64x16x32x128xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %5 = arith.addf %in, %in : f32
-      linalg.yield %5 : f32
-    } -> tensor<64x16x32x128xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : tensor<64x16x32x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x16x32x128xf32>>
-    return
-  }
+func.func @add_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x16x32x128xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x16x32x128xf32>> -> tensor<64x16x32x128xf32>
+  %3 = tensor.empty() : tensor<64x16x32x128xf32>
+  %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<64x16x32x128xf32>) outs(%3 : tensor<64x16x32x128xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %5 = arith.addf %in, %in : f32
+    linalg.yield %5 : f32
+  } -> tensor<64x16x32x128xf32>
tensor<64x16x32x128xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [64, 16, 32, 128], strides = [1, 1, 1, 1] : tensor<64x16x32x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -213,26 +247,31 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> -module { - func.func @preset_config_matmul_tensors() attributes { - hal.executable.target = #executable_target_system_elf_x86_64_, - translation_info = #translation - } { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x512xf32> - %5 = tensor.empty() : tensor<128x512xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32> - %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x256xf32>, tensor<256x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @preset_config_matmul_tensors() attributes { + hal.executable.target = #executable_target_system_elf_x86_64_, + translation_info = #translation + } { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x512xf32> + %5 = tensor.empty() : tensor<128x512xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x512xf32>) -> tensor<128x512xf32> + %7 = linalg.matmul {lowering_config = #config} ins(%3, %4 : tensor<128x256xf32>, tensor<256x512xf32>) outs(%6 : tensor<128x512xf32>) -> tensor<128x512xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -244,21 +283,26 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> 
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @matmul_partially_peel() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16x8xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16641, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>> -> tensor<16641x16xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x8xf32>> -> tensor<16x8xf32>
-    %5 = tensor.empty() : tensor<16641x8xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16641x8xf32>) -> tensor<16641x8xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<16641x16xf32>, tensor<16x8xf32>) outs(%6 : tensor<16641x8xf32>) -> tensor<16641x8xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [16641, 8], strides = [1, 1] : tensor<16641x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
-    return
-  }
+func.func @matmul_partially_peel() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<16x8xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16641, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16641x16xf32>> -> tensor<16641x16xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 8], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<16x8xf32>> -> tensor<16x8xf32>
+  %5 = tensor.empty() : tensor<16641x8xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16641x8xf32>) -> tensor<16641x8xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<16641x16xf32>, tensor<16x8xf32>) outs(%6 : tensor<16641x8xf32>) -> tensor<16641x8xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [16641, 8], strides = [1, 1] : tensor<16641x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<16641x8xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -270,25 +314,29 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 6, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @copy_op_dynamic() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.constant.load[3] : index
-    %4 = hal.interface.constant.load[4] : index
-    %5 = hal.interface.constant.load[5] : index
-    %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
-    %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%2, %3}
-    %subview = memref.subview %7[%4, %5] [%0, %1] [1, 1] : memref<?x?xi32> to memref<?x?xi32, strided<[?, 1], offset: ?>>
-    linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : memref<?x?xi32>) outs(%subview : memref<?x?xi32, strided<[?, 1], offset: ?>>) {
-    ^bb0(%in: i32, %out: i32):
-      linalg.yield %in : i32
-    }
-    return
-  }
+func.func @copy_op_dynamic() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<?x?xi32>{%0, %1}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<?x?xi32>{%2, %3}
+  %subview = memref.subview %7[%4, %5] [%0, %1] [1, 1] : memref<?x?xi32> to memref<?x?xi32, strided<[?, 1], offset: ?>>
+  linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : memref<?x?xi32>) outs(%subview : memref<?x?xi32, strided<[?, 1], offset: ?>>) {
+  ^bb0(%in: i32, %out: i32):
+    linalg.yield %in : i32
+  }
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -300,22 +348,26 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @static_1d_fft_stage2() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %c2 = arith.constant 2 : index
-    %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
-    %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
-    %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
-    %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
-    flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    return
-  }
+func.func @static_1d_fft_stage2() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
+  %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+  %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+  %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
+  flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -327,19 +379,23 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @static_3d_fft_stage3() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %c3 = arith.constant 3 : index
-    %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
-    %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
-    %0 = bufferization.to_memref %cst_0 : memref<4xf32>
-    %1 = bufferization.to_memref %cst : memref<4xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
-    iree_linalg_ext.fft ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
-    return
-  }
+func.func @static_3d_fft_stage3() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %c3 = arith.constant 3 : index
+  %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
+  %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
+  %0 = bufferization.to_memref %cst_0 : memref<4xf32>
+  %1 = bufferization.to_memref %cst : memref<4xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x128x32xf32>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<64x128x32xf32>
+  iree_linalg_ext.fft ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -351,36 +407,41 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 3, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map3 = affine_map<(d0, d1, d2) -> (d0, d1)>
-module {
-  func.func @outs_fusion_fn() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
-    %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
-    %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    %6 = tensor.empty(%0, %1) : tensor<?x?xf32>
-    %7 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%6 : tensor<?x?xf32>) {
-    ^bb0(%out: f32):
-      linalg.yield %cst : f32
-    } -> tensor<?x?xf32>
-    %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
-    %9 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
-    %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %9 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%7 : tensor<?x?xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %11 = arith.mulf %in, %in_0 : f32
-      %12 = arith.addf %11, %out : f32
-      linalg.yield %11 : f32
-    } -> tensor<?x?xf32>
-    flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
-    return
-  }
+func.func @outs_fusion_fn() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1}
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  %6 = tensor.empty(%0, %1) : tensor<?x?xf32>
+  %7 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]} outs(%6 : tensor<?x?xf32>) {
+  ^bb0(%out: f32):
+    linalg.yield %cst : f32
+  } -> tensor<?x?xf32>
+  %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%0, %2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %2} -> tensor<?x?xf32>
+  %9 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%2, %1} -> tensor<?x?xf32>
+  %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"]} ins(%8, %9 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%7 : tensor<?x?xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %11 = arith.mulf %in, %in_0 : f32
+    %12 = arith.addf %11, %out : f32
+    linalg.yield %11 : f32
+  } -> tensor<?x?xf32>
+  flow.dispatch.tensor.store %10, %5, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
+  return
 }
 // CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
@@ -395,28 +456,33 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 9, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @conv_dynamic() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.constant.load[2] : index
-    %3 = hal.interface.constant.load[3] : index
-    %4 = hal.interface.constant.load[4] : index
-    %5 = hal.interface.constant.load[5] : index
-    %6 = hal.interface.constant.load[6] : index
-    %7 = hal.interface.constant.load[7] : index
-    %8 = hal.interface.constant.load[8] : index
-    %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
-    %10 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %5, %3, %6}
-    %11 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6}
-    %12 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
-    %13 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [%4, %5, %3, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %5, %3, %6} -> tensor<?x?x?x?xf32>
-    %14 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [%0, %7, %8, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6} -> tensor<?x?x?x?xf32>
-    %15 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%12, %13 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%14 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
-    flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0, 0], sizes = [%0, %7, %8, %6], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6}
-    return
-  }
+func.func @conv_dynamic() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index
+  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index
+  %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : index
+  %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : index
+  %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3}
+  %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %5, %3, %6}
+  %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6}
+  %12 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [%0, %1, %2, %3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%0, %1, %2, %3} -> tensor<?x?x?x?xf32>
+  %13 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0, 0], sizes = [%4, %5, %3, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%4, %5, %3, %6} -> tensor<?x?x?x?xf32>
+  %14 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [%0, %7, %8, %6], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6} -> tensor<?x?x?x?xf32>
+  %15 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%12, %13 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%14 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0, 0], sizes = [%0, %7, %8, %6], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xf32>>{%0, %7, %8, %6}
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -428,23 +494,27 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %c607520 = arith.constant 607520 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c607520) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
-    %5 = tensor.empty() : tensor<1x112x112x16xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
-    return
-  }
+func.func @conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c607520 = arith.constant 607520 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c607520) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
+  %5 = tensor.empty() : tensor<1x112x112x16xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x112x112x16xf32>) -> tensor<1x112x112x16xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 16], strides = [1, 1, 1, 1] : tensor<1x112x112x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x16xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -455,22 +525,27 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @conv_nchw_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 128, 30, 30], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>> -> tensor<1x128x30x30xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>> -> tensor<128x128x3x3xf32>
-    %5 = tensor.empty() : tensor<1x128x28x28xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%6 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 128, 28, 28], strides = [1, 1, 1, 1] : tensor<1x128x28x28xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
-    return
-  }
+func.func @conv_nchw_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 128, 30, 30], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x128x30x30xf32>> -> tensor<1x128x30x30xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [128, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x128x3x3xf32>> -> tensor<128x128x3x3xf32>
+  %5 = tensor.empty() : tensor<1x128x28x28xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
+  %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%6 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 128, 28, 28], strides = [1, 1, 1, 1] : tensor<1x128x28x28xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x128x28x28xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -481,21 +556,26 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @depthwise_conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>> -> tensor<1x161x161x240xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>> -> tensor<3x3x240xf32>
-    %5 = tensor.empty() : tensor<1x80x80x240xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x161x161x240xf32>, tensor<3x3x240xf32>) outs(%6 : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 240], strides = [1, 1, 1, 1] : tensor<1x80x80x240xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
-    return
-  }
+func.func @depthwise_conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 161, 161, 240], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x161x161x240xf32>> -> tensor<1x161x161x240xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 240], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x240xf32>> -> tensor<3x3x240xf32>
+  %5 = tensor.empty() : tensor<1x80x80x240xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x161x161x240xf32>, tensor<3x3x240xf32>) outs(%6 : tensor<1x80x80x240xf32>) -> tensor<1x80x80x240xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 80, 80, 240], strides = [1, 1, 1, 1] : tensor<1x80x80x240xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x80x80x240xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -507,21 +587,26 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-linux-gnu"}>
-module {
-  func.func @thin_depthwise_conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 72], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 72], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
-    %5 = tensor.empty() : tensor<1x28x28x72xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
-    return
-  }
+func.func @thin_depthwise_conv_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 72], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x72xf32>> -> tensor<1x57x57x72xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 72], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x72xf32>> -> tensor<3x3x72xf32>
+  %5 = tensor.empty() : tensor<1x28x28x72xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x72xf32>, tensor<3x3x72xf32>) outs(%6 : tensor<1x28x28x72xf32>) -> tensor<1x28x28x72xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 72], strides = [1, 1, 1, 1] : tensor<1x28x28x72xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x72xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -533,22 +618,26 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vnni,+adx,+clflushopt,+clwb,+cx16,+cx8,+crc32,+f16c,+fsgsbase,+fxsr,+invpcid,+lzcnt,+movbe,+pku,+prfchw,+rdrnd,+rdseed,+sahf,+x87,+xsave,+xsavec,+xsaveopt,+xsaves", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-none-elf", ukernels = false}>
-module {
-  func.func @pooling_nchw_max() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c3846080 = arith.constant 3846080 : index
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant -3.40282347E+38 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c3846080) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 64, 114, 114], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>> -> tensor<1x64x114x114xf32>
-    %3 = tensor.empty() : tensor<1x64x56x56xf32>
-    %4 = tensor.empty() : tensor<3x3xf32>
-    %5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    %6 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%2, %4 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%5 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
-    flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 64, 56, 56], strides = [1, 1, 1, 1] : tensor<1x64x56x56xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
-    return
-  }
+func.func @pooling_nchw_max() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c3846080 = arith.constant 3846080 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -3.40282347E+38 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c3846080) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 64, 114, 114], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>> -> tensor<1x64x114x114xf32>
+  %3 = tensor.empty() : tensor<1x64x56x56xf32>
+  %4 = tensor.empty() : tensor<3x3xf32>
+  %5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
+  %6 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%2, %4 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%5 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
+  flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 64, 56, 56], strides = [1, 1, 1, 1] : tensor<1x64x56x56xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -560,22 +649,26 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-pc-linux-gnu"}>
 #map = affine_map<(d0, d1) -> (d1, d0)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @generic_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x16xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [96, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x16xf32>> -> tensor<96x16xf32>
-    %3 = tensor.empty() : tensor<16x96xf32>
-    %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<96x16xf32>) outs(%3 : tensor<16x96xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      linalg.yield %in : f32
-    } -> tensor<16x96xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [16, 96], strides = [1, 1] : tensor<16x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
-    return
-  }
+func.func @generic_static() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<96x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [96, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x16xf32>> -> tensor<96x16xf32>
+  %3 = tensor.empty() : tensor<16x96xf32>
+  %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<96x16xf32>) outs(%3 : tensor<16x96xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  } -> tensor<16x96xf32>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [16, 96], strides = [1, 1] : tensor<16x96xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x96xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -586,21 +679,26 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @matmul_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
-    %5 = tensor.empty() : tensor<384x128xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
-    return
-  }
+func.func @matmul_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
+  %5 = tensor.empty() : tensor<384x128xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<384x128xf32>) -> tensor<384x128xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [384, 128], strides = [1, 1] : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -647,22 +745,27 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @matmul_i8_i8_i32_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0_i32 = arith.constant 0 : i32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xi8>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xi8>> -> tensor<128x384xi8>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>> -> tensor<384x1536xi8>
-    %5 = tensor.empty() : tensor<128x1536xi32>
-    %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
-    return
-  }
+func.func @matmul_i8_i8_i32_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xi8>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xi8>> -> tensor<128x384xi8>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1536], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1536xi8>> -> tensor<384x1536xi8>
+  %5 = tensor.empty() : tensor<128x1536xi32>
+  %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<128x384xi8>, tensor<384x1536xi8>) outs(%6 : tensor<128x1536xi32>) -> tensor<128x1536xi32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1536], strides = [1, 1] : tensor<128x1536xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x1536xi32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -674,22 +777,27 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @gemm_unit_N() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%1}
-    %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0}
-    %5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%1} -> tensor<?x1xf32>
-    %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
-    %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0} -> tensor<?x1xf32>
-    %8 = linalg.matmul ins(%6, %5 : tensor<?x?xf32>, tensor<?x1xf32>) outs(%7 : tensor<?x1xf32>) -> tensor<?x1xf32>
-    flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : tensor<?x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0}
-    return
-  }
+func.func @gemm_unit_N() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%1}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0}
+  %5 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [%1, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%1} -> tensor<?x1xf32>
+  %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
+  %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0} -> tensor<?x1xf32>
+  %8 = linalg.matmul ins(%6, %5 : tensor<?x?xf32>, tensor<?x1xf32>) outs(%7 : tensor<?x1xf32>) -> tensor<?x1xf32>
+  flow.dispatch.tensor.store %8, %4, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : tensor<?x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<?x1xf32>>{%0}
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -701,21 +809,26 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @gemm_unit_M_unit_N() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0}
-    %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%0}
-    %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0} -> tensor<1x?xf32>
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%0} -> tensor<?x1xf32>
-    %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
-    %7 = linalg.matmul ins(%4, %5 : tensor<1x?xf32>, tensor<?x1xf32>) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
-    flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
-    return
-  }
+func.func @gemm_unit_M_unit_N() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%0}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x?xf32>>{%0} -> tensor<1x?xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x1xf32>>{%0} -> tensor<?x1xf32>
+  %6 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1xf32>> -> tensor<1x1xf32>
+  %7 = linalg.matmul ins(%4, %5 : tensor<1x?xf32>, tensor<?x1xf32>) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
+  flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<1x1xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -727,24 +840,30 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @matmul_odd() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 16], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x16xf32> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<16x49xf32> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x49xf32> - %7 = tensor.empty() : tensor<33x49xf32> - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<33x49xf32>) -> tensor<33x49xf32> - %9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32> - flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor> - return - } +func.func @matmul_odd() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 16], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x16xf32> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 49], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<16x49xf32> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x49xf32> + %7 = tensor.empty() : tensor<33x49xf32> + %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<33x49xf32>) -> tensor<33x49xf32> + %9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32> + flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [33, 49], strides = [1, 1] : tensor<33x49xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -756,27 +875,31 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = 
#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}> #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0, d1, d2, d3, d4, d5, d6, d7)> -module { - func.func @generic_unit_dims_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : index - %1 = hal.interface.constant.load[1] : index - %2 = hal.interface.constant.load[2] : index - %3 = hal.interface.constant.load[3] : index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor>{%0, %1, %2, %3} - %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %0, 1, 1, %1, %2, 1, %3], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %2, %3} -> tensor<1x?x1x1x?x?x1x?xf32> - %7 = tensor.empty(%0, %1, %2, %3) : tensor<1x?x1x1x?x?x1x?xf32> - %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x?x1x1x?x?x1x?xf32>) outs(%7 : tensor<1x?x1x1x?x?x1x?xf32>) { - ^bb0(%in: f32, %out: f32): - %9 = arith.addf %in, %in : f32 - linalg.yield %9 : f32 - } -> tensor<1x?x1x1x?x?x1x?xf32> - flow.dispatch.tensor.store %8, %5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %0, 1, 1, %1, %2, 1, %3], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x1x1x?x?x1x?xf32> -> !flow.dispatch.tensor>{%0, %1, %2, %3} - return - } +func.func @generic_unit_dims_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor>{%0, %1, %2, %3} + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor>{%0, %1, %2, %3} + %6 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %0, 1, 1, %1, %2, 1, %3], strides = [1, 1, 1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor>{%0, %1, %2, %3} -> tensor<1x?x1x1x?x?x1x?xf32> + %7 = tensor.empty(%0, %1, %2, %3) : tensor<1x?x1x1x?x?x1x?xf32> + %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x?x1x1x?x?x1x?xf32>) outs(%7 : tensor<1x?x1x1x?x?x1x?xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = arith.addf %in, %in : f32 + linalg.yield %9 : f32 + } -> tensor<1x?x1x1x?x?x1x?xf32> + flow.dispatch.tensor.store %8, %5, offsets = [0, 0, 0, 0, 0, 0, 0, 0], sizes = [1, %0, 1, 1, %1, %2, 1, %3], strides = [1, 1, 1, 1, 1, 1, 1, 1] : tensor<1x?x1x1x?x?x1x?xf32> -> !flow.dispatch.tensor>{%0, %1, %2, %3} + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -787,26 +910,30 @@ module { // ----- +#pipeline_layout 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> ()>
-module {
-  func.func @reduce_to_scalar_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
-    %3 = tensor.empty() : tensor<f32>
-    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<f32>) -> tensor<f32>
-    %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%2 : tensor<128xf32>) outs(%4 : tensor<f32>) {
-    ^bb0(%in: f32, %out: f32):
-      %6 = arith.addf %in, %out : f32
-      linalg.yield %6 : f32
-    } -> tensor<f32>
-    flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
-    return
-  }
+func.func @reduce_to_scalar_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
+  %3 = tensor.empty() : tensor<f32>
+  %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<f32>) -> tensor<f32>
+  %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%2 : tensor<128xf32>) outs(%4 : tensor<f32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = arith.addf %in, %out : f32
+    linalg.yield %6 : f32
+  } -> tensor<f32>
+  flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<writeonly:tensor<f32>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -818,25 +945,29 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<(d0) -> (d0)>
#map1 = affine_map<(d0) -> ()>
-module {
-  func.func @reduce_to_scalar_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%0}
-    %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<f32>>
-    %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [%0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%0} -> tensor<?xf32>
-    %4 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
-    %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%3 : tensor<?xf32>) outs(%4 : tensor<f32>) {
-    ^bb0(%in: f32, %out: f32):
-      %6 = arith.addf %in, %out : f32
-      linalg.yield %6 : f32
-    } -> tensor<f32>
-    flow.dispatch.tensor.store %5, %2, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
-    return
-  }
+func.func @reduce_to_scalar_dynamic() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%0}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<f32>>
+  %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [%0], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf32>>{%0} -> tensor<?xf32>
+  %4 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
+  %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%3 : tensor<?xf32>) outs(%4 : tensor<f32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = arith.addf %in, %out : f32
+    linalg.yield %6 : f32
+  } -> tensor<f32>
+  flow.dispatch.tensor.store %5, %2, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -847,23 +978,27 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
#map = affine_map<() -> ()>
-module {
-  func.func @scalar() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<f32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<f32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
-    %3 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
-    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%2 : tensor<f32>) outs(%3 : tensor<f32>) {
-    ^bb0(%in: f32, %out: f32):
-      %5 = arith.addf %in, %out : f32
-      linalg.yield %5 : f32
-    } -> tensor<f32>
-    flow.dispatch.tensor.store %4, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
-    return
-  }
+func.func @scalar() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<f32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<f32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
+  %3 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:tensor<f32>> -> tensor<f32>
+  %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = []} ins(%2 : tensor<f32>) outs(%3 : tensor<f32>) {
+  ^bb0(%in: f32, %out: f32):
+    %5 = arith.addf %in, %out : f32
+    linalg.yield %5 : f32
+  } -> tensor<f32>
+  flow.dispatch.tensor.store %4, %1, offsets = [], sizes = [], strides = [] : tensor<f32> -> !flow.dispatch.tensor<readwrite:tensor<f32>>
+  return
 }
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
// CHECK: func.func @scalar()
@@ -871,24 +1006,28 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
#map = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @transpose_8x8() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1024x512xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
-    %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1024x512xf32>> -> tensor<1024x512xf32>
-    %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      linalg.yield %in : f32
-    } -> tensor<1024x512xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<readwrite:tensor<1024x512xf32>>
-    return
-  }
+func.func @transpose_8x8() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1024x512xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x1024xf32>> -> tensor<512x1024xf32>
+  %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+  %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  } -> tensor<1024x512xf32>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor<readwrite:tensor<1024x512xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -896,24 +1035,28 @@ module {
// -----
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1) -> (d1, d0)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @transpose_16x16() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> - %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<1024x512xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @transpose_16x16() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<512x1024xf32>) outs(%3 : tensor<1024x512xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1024x512xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : tensor<1024x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -921,37 +1064,42 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @multi_root() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %c6144 = arith.constant 6144 : index - %c792576 = arith.constant 792576 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) 
binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c792576) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x128xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [12, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<12x128xf32> - %5 = tensor.empty() : tensor<12x128xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<12x128xf32>) -> tensor<12x128xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3 : tensor<12x128x128xf32>) outs(%4 : tensor<12x128xf32>) { - ^bb0(%in: f32, %out: f32): - %9 = arith.maximumf %in, %out : f32 - linalg.yield %9 : f32 - } -> tensor<12x128xf32> - %8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %7 : tensor<12x128x128xf32>, tensor<12x128xf32>) outs(%6 : tensor<12x128xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %9 = arith.subf %in, %in_0 : f32 - %10 = math.exp %9 : f32 - %11 = arith.addf %10, %out : f32 - linalg.yield %11 : f32 - } -> tensor<12x128xf32> - flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [12, 128], strides = [1, 1] : tensor<12x128xf32> -> !flow.dispatch.tensor> - return - } +func.func @multi_root() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %c6144 = arith.constant 6144 : index + %c792576 = arith.constant 792576 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c792576) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x128xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [12, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<12x128xf32> + %5 = tensor.empty() : tensor<12x128xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<12x128xf32>) -> tensor<12x128xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3 : tensor<12x128x128xf32>) outs(%4 : tensor<12x128xf32>) { + ^bb0(%in: f32, %out: f32): + %9 = arith.maximumf %in, %out : f32 + linalg.yield %9 : f32 + } -> tensor<12x128xf32> + %8 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %7 : tensor<12x128x128xf32>, tensor<12x128xf32>) outs(%6 : tensor<12x128xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %9 = arith.subf %in, %in_0 : f32 + %10 = math.exp %9 : f32 + %11 = arith.addf %10, %out : f32 + linalg.yield %11 : f32 + } -> tensor<12x128xf32> + flow.dispatch.tensor.store %8, %2, offsets = [0, 0], sizes = [12, 128], strides = [1, 1] : tensor<12x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = 
#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<20x40xf32> - %3 = tensor.empty() : tensor<2x48x16x1xf32> - %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf32> -> tensor<2x48x16x1xf32> - flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf32> -> !flow.dispatch.tensor> - return - } +func.func @pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<20x40xf32> + %3 = tensor.empty() : tensor<2x48x16x1xf32> + %pack = tensor.pack %2 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf32> -> tensor<2x48x16x1xf32> + flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -992,19 +1144,23 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @pack_f16() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<20x40xf16> - %3 = tensor.empty() : tensor<2x48x16x1xf16> - %pack = tensor.pack %2 padding_value(%cst : f16) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf16> -> tensor<2x48x16x1xf16> - flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf16> -> !flow.dispatch.tensor> - 
-    return
-  }
+func.func @pack_f16() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<20x40xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf16>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [20, 40], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<20x40xf16>> -> tensor<20x40xf16>
+  %3 = tensor.empty() : tensor<2x48x16x1xf16>
+  %pack = tensor.pack %2 padding_value(%cst : f16) inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %3 : tensor<20x40xf16> -> tensor<2x48x16x1xf16>
+  flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [2, 48, 16, 1], strides = [1, 1, 1, 1] : tensor<2x48x16x1xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x48x16x1xf16>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -1016,18 +1172,22 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @pack_many_elements() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
-    %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
-    %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
-    flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
-    return
-  }
+func.func @pack_many_elements() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1200, 500000], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1200x500000xf32>> -> tensor<1200x500000xf32>
+  %3 = tensor.empty() : tensor<31250x1200x16x1xf32>
+  %pack = tensor.pack %2 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %3 : tensor<1200x500000xf32> -> tensor<31250x1200x16x1xf32>
+  flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [31250, 1200, 16, 1], strides = [1, 1, 1, 1] : tensor<31250x1200x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<31250x1200x16x1xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -1039,33 +1199,38 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @unpack_generic_pack(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 3.40282347E+38 : f32
-    %cst_0 = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [24, 32, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>> -> tensor<24x32x16x16xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
-    %5 = tensor.empty() : tensor<24x512x16x1xf32>
-    %6 = tensor.empty() : tensor<384x512xf32>
-    %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %6 : tensor<24x32x16x16xf32> -> tensor<384x512xf32>
-    %7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<512xf32>, tensor<384x512xf32>) outs(%6 : tensor<384x512xf32>) {
-    ^bb0(%in: f32, %in_1: f32, %out: f32):
-      %8 = arith.addf %in, %in_1 : f32
-      %9 = arith.minimumf %8, %cst : f32
-      %10 = arith.maximumf %9, %cst_0 : f32
-      linalg.yield %10 : f32
-    } -> tensor<384x512xf32>
-    %pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x512xf32> -> tensor<24x512x16x1xf32>
-    flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [24, 512, 16, 1], strides = [1, 1, 1, 1] : tensor<24x512x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
-    return
-  }
+func.func @unpack_generic_pack(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 3.40282347E+38 : f32
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<512xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [24, 32, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<24x32x16x16xf32>> -> tensor<24x32x16x16xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor<readonly:tensor<512xf32>> -> tensor<512xf32>
+  %5 = tensor.empty() : tensor<24x512x16x1xf32>
+  %6 = tensor.empty() : tensor<384x512xf32>
+  %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %6 : tensor<24x32x16x16xf32> -> tensor<384x512xf32>
+  %7 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<512xf32>, tensor<384x512xf32>) outs(%6 : tensor<384x512xf32>) {
+  ^bb0(%in: f32, %in_1: f32, %out: f32):
+    %8 = arith.addf %in, %in_1 : f32
+    %9 = arith.minimumf %8, %cst : f32
+    %10 = arith.maximumf %9, %cst_0 : f32
+    linalg.yield %10 : f32
+  } -> tensor<384x512xf32>
+  %pack = tensor.pack %7 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x512xf32> -> tensor<24x512x16x1xf32>
+  flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [24, 512, 16, 1], strides = [1, 1, 1, 1] : tensor<24x512x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x512x16x1xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
@@ -1082,25 +1247,29 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
#map = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @elem_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
-    %3 = tensor.empty() : tensor<128x384xf32>
-    %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<128x384xf32>) outs(%3 : tensor<128x384xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %6 = arith.addf %in, %in : f32
-      linalg.yield %6 : f32
-    } -> tensor<128x384xf32>
-    %5 = tensor.empty() : tensor<16x384x8x1xf32>
-    %pack = tensor.pack %4 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %5 : tensor<128x384xf32> -> tensor<16x384x8x1xf32>
-    flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [16, 384, 8, 1], strides = [1, 1, 1, 1] : tensor<16x384x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
-    return
-  }
+func.func @elem_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<128x384xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x384xf32>> -> tensor<128x384xf32>
+  %3 = tensor.empty() : tensor<128x384xf32>
+  %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<128x384xf32>) outs(%3 : tensor<128x384xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = arith.addf %in, %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<128x384xf32>
+  %5 = tensor.empty() : tensor<16x384x8x1xf32>
+  %pack = tensor.pack %4 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %5 : tensor<128x384xf32> -> tensor<16x384x8x1xf32>
+  flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [16, 384, 8, 1], strides = [1, 1, 1, 1] : tensor<16x384x8x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x384x8x1xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
@@ -1115,27 +1284,31 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>
#map = affine_map<(d0, d1) -> (d1, d0)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @transpose_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c1579008 = arith.constant 1579008 : index
-    %c3147776 = arith.constant 3147776 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c1579008) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3147776) : !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [30522, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>> -> tensor<30522x768xf32>
-    %3 = tensor.empty() : tensor<768x30522xf32>
-    %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<30522x768xf32>) outs(%3 : tensor<768x30522xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      linalg.yield %in : f32
-    } -> tensor<768x30522xf32>
-    %5 = tensor.empty() : tensor<1908x768x16x1xf32>
-    %pack = tensor.pack %4 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %5 : tensor<768x30522xf32> -> tensor<1908x768x16x1xf32>
-    flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [1908, 768, 16, 1], strides = [1, 1, 1, 1] : tensor<1908x768x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
-    return
-  }
+func.func @transpose_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c1579008 = arith.constant 1579008 : index
+  %c3147776 = arith.constant 3147776 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c1579008) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c3147776) : !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [30522, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<30522x768xf32>> -> tensor<30522x768xf32>
+  %3 = tensor.empty() : tensor<768x30522xf32>
+  %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<30522x768xf32>) outs(%3 : tensor<768x30522xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  } -> tensor<768x30522xf32>
+  %5 = tensor.empty() : tensor<1908x768x16x1xf32>
+  %pack = tensor.pack %4 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %5 : tensor<768x30522xf32> -> tensor<1908x768x16x1xf32>
+  flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [1908, 768, 16, 1], strides = [1, 1, 1, 1] : tensor<1908x768x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1908x768x16x1xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
@@ -1150,52 +1323,59 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
#map2 = affine_map<(d0, d1) -> (d1)>
-module {
-  func.func @reduction_broadcast_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant -0.000000e+00 : f32
-    %cst_0 = arith.constant 1.024000e+03 : f32
-    %cst_1 = arith.constant 9.99999996E-13 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
-    %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
-    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
-    %6 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
-    %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
-    %8 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
-    %9 = tensor.empty() : tensor<24x1024x16x1xf32>
-    %10 = tensor.empty() : tensor<384x1024xf32>
-    %11 = tensor.empty() : tensor<384xf32>
-    %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<384xf32>) -> tensor<384xf32>
-    %13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%5, %6 : tensor<384x1024xf32>, tensor<384xf32>) outs(%12 : tensor<384xf32>) {
-    ^bb0(%in: f32, %in_2: f32, %out: f32):
-      %15 = arith.subf %in, %in_2 : f32
-      %16 = arith.mulf %15, %15 : f32
-      %17 = arith.addf %out, %16 : f32
-      linalg.yield %17 : f32
-    } -> tensor<384xf32>
-    %14 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %13, %7, %8, %6 : tensor<384x1024xf32>, tensor<384xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<384xf32>) outs(%10 : tensor<384x1024xf32>) {
-    ^bb0(%in: f32, %in_2: f32, %in_3: f32, %in_4: f32, %in_5: f32, %out: f32):
-      %15 = arith.divf %in_2, %cst_0 : f32
-      %16 = arith.addf %15, %cst_1 : f32
-      %17 = math.rsqrt %16 : f32
-      %18 = arith.mulf %17, %in_3 : f32
-      %19 = arith.mulf %in_5, %18 : f32
-      %20 = arith.subf %in_4, %19 : f32
-      %21 = arith.mulf %in, %18 : f32
-      %22 = arith.addf %21, %20 : f32
-      linalg.yield %22 : f32
-    } -> tensor<384x1024xf32>
-    %pack = tensor.pack %14 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %9 : tensor<384x1024xf32> -> tensor<24x1024x16x1xf32>
-    flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [24, 1024, 16, 1], strides = [1, 1, 1, 1] : tensor<24x1024x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
-    return
-  }
+func.func @reduction_broadcast_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -0.000000e+00 : f32
+  %cst_0 = arith.constant 1.024000e+03 : f32
+  %cst_1 = arith.constant 9.99999996E-13 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024xf32>>
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
+  %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
+  %6 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<384xf32>> -> tensor<384xf32>
+  %7 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
+  %8 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [1024], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1024xf32>> -> tensor<1024xf32>
+  %9 = tensor.empty() : tensor<24x1024x16x1xf32>
+  %10 = tensor.empty() : tensor<384x1024xf32>
+  %11 = tensor.empty() : tensor<384xf32>
+  %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<384xf32>) -> tensor<384xf32>
+  %13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%5, %6 : tensor<384x1024xf32>, tensor<384xf32>) outs(%12 : tensor<384xf32>) {
+  ^bb0(%in: f32, %in_2: f32, %out: f32):
+    %15 = arith.subf %in, %in_2 : f32
+    %16 = arith.mulf %15, %15 : f32
+    %17 = arith.addf %out, %16 : f32
+    linalg.yield %17 : f32
+  } -> tensor<384xf32>
+  %14 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %13, %7, %8, %6 : tensor<384x1024xf32>, tensor<384xf32>, tensor<1024xf32>, tensor<1024xf32>, tensor<384xf32>) outs(%10 : tensor<384x1024xf32>) {
+  ^bb0(%in: f32, %in_2: f32, %in_3: f32, %in_4: f32, %in_5: f32, %out: f32):
+    %15 = arith.divf %in_2, %cst_0 : f32
+    %16 = arith.addf %15, %cst_1 : f32
+    %17 = math.rsqrt %16 : f32
+    %18 = arith.mulf %17, %in_3 : f32
+    %19 = arith.mulf %in_5, %18 : f32
+    %20 = arith.subf %in_4, %19 : f32
+    %21 = arith.mulf %in, %18 : f32
+    %22 = arith.addf %21, %20 : f32
+    linalg.yield %22 : f32
+  } -> tensor<384x1024xf32>
+  %pack = tensor.pack %14 inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %9 : tensor<384x1024xf32> -> tensor<24x1024x16x1xf32>
+  flow.dispatch.tensor.store %pack, %4, offsets = [0, 0, 0, 0], sizes = [24, 1024, 16, 1], strides = [1, 1, 1, 1] : tensor<24x1024x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<24x1024x16x1xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
@@ -1216,33 +1396,38 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+avx2", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf", ukernels = false}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-module {
-  func.func @reduction_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant -0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [384, 1024, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>> -> tensor<384x1024x32xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
-    %5 = tensor.empty() : tensor<1024x24x16x1xf32>
-    %6 = tensor.empty() : tensor<384x1024x32xf32>
-    %7 = tensor.empty() : tensor<384x1024xf32>
-    %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<384x1024xf32>) -> tensor<384x1024xf32>
-    %9 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<384x1024x32xf32>, tensor<384x1024xf32>) outs(%8 : tensor<384x1024xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %10 = arith.subf %in, %in_0 : f32
-      %11 = arith.mulf %10, %10 : f32
-      %12 = arith.addf %out, %11 : f32
-      linalg.yield %12 : f32
-    } -> tensor<384x1024xf32>
-    %pack = tensor.pack %9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x1024xf32> -> tensor<1024x24x16x1xf32>
-    flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [1024, 24, 16, 1], strides = [1, 1, 1, 1] : tensor<1024x24x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
-    return
-  }
+func.func @reduction_pack() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [384, 1024, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024x32xf32>> -> tensor<384x1024x32xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x1024xf32>> -> tensor<384x1024xf32>
+  %5 = tensor.empty() : tensor<1024x24x16x1xf32>
+  %6 = tensor.empty() : tensor<384x1024x32xf32>
+  %7 = tensor.empty() : tensor<384x1024xf32>
+  %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<384x1024xf32>) -> tensor<384x1024xf32>
+  %9 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<384x1024x32xf32>, tensor<384x1024xf32>) outs(%8 : tensor<384x1024xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %10 = arith.subf %in, %in_0 : f32
+    %11 = arith.mulf %10, %10 : f32
+    %12 = arith.addf %out, %11 : f32
+    linalg.yield %12 : f32
+  } -> tensor<384x1024xf32>
+  %pack = tensor.pack %9 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %5 : tensor<384x1024xf32> -> tensor<1024x24x16x1xf32>
+  flow.dispatch.tensor.store %pack, %2, offsets = [0, 0, 0, 0], sizes = [1024, 24, 16, 1], strides = [1, 1, 1, 1] : tensor<1024x24x16x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x24x16x1xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
@@ -1260,19 +1445,23 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
-module {
-  func.func @unpack_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c41943040 = arith.constant 41943040 : index
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c41943040) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 256, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>> -> tensor<64x256x16x16xf32>
-    %3 = tensor.empty() : tensor<1024x4096xf32>
-    %unpack = tensor.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor<64x256x16x16xf32> -> tensor<1024x4096xf32>
-    flow.dispatch.tensor.store %unpack, %1, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : tensor<1024x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
-    return
-  }
+func.func @unpack_static() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c41943040 = arith.constant 41943040 : index
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c41943040) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [64, 256, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x256x16x16xf32>> -> tensor<64x256x16x16xf32>
+  %3 = tensor.empty() : tensor<1024x4096xf32>
+  %unpack = tensor.unpack %2 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %3 : tensor<64x256x16x16xf32> -> tensor<1024x4096xf32>
+  flow.dispatch.tensor.store %unpack, %1, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : tensor<1024x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x4096xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -1284,29 +1473,34 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
#map = affine_map<(d0, d1) -> (d0)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @unpack_elem() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [48, 64, 8, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>> -> tensor<48x64x8x2xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
-    %5 = tensor.empty() : tensor<128x384xf32>
-    %6 = tensor.empty() : tensor<384x128xf32>
-    %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %6 : tensor<48x64x8x2xf32> -> tensor<384x128xf32>
-    %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<128xf32>, tensor<384x128xf32>) outs(%5 : tensor<128x384xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %8 = arith.addf %in, %in_0 : f32
-      linalg.yield %8 : f32
-    } -> tensor<128x384xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
-    return
-  }
+func.func @unpack_elem() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [48, 64, 8, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<48x64x8x2xf32>> -> tensor<48x64x8x2xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor<readonly:tensor<128xf32>> -> tensor<128xf32>
+  %5 = tensor.empty() : tensor<128x384xf32>
+  %6 = tensor.empty() : tensor<384x128xf32>
+  %unpack = tensor.unpack %3 inner_dims_pos = [0, 1] inner_tiles = [8, 2] into %6 : tensor<48x64x8x2xf32> -> tensor<384x128xf32>
+  %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel"]} ins(%4, %unpack : tensor<128xf32>, tensor<384x128xf32>) outs(%5 : tensor<128x384xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %8 = arith.addf %in, %in_0 : f32
+    linalg.yield %8 : f32
+  } -> tensor<128x384xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -1318,37 +1512,43 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}>
#map = affine_map<(d0, d1) -> (d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @quant_model() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %c12_i32 = arith.constant 12 : i32
-    %c-128_i32 = arith.constant -128 : i32
-    %c127_i32 = arith.constant 127 : i32
-    %c0_i32 = arith.constant 0 : i32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<24x144xi8>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
-    %3 = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
-    %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>> -> tensor<2304x24xi8>
-    %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<24x144xi8>> -> tensor<24x144xi8>
-    %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
-    %7 = tensor.empty() : tensor<2304x144xi8>
-    %8 = tensor.empty() : tensor<2304x144xi32>
-    %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
-    %10 = linalg.matmul ins(%4, %5 : tensor<2304x24xi8>, tensor<24x144xi8>) outs(%9 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
-    %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %10 : tensor<144xi32>, tensor<2304x144xi32>) outs(%7 : tensor<2304x144xi8>) {
-    ^bb0(%in: i32, %in_0: i32, %out: i8):
-      %12 = arith.subi %in_0, %c12_i32 : i32
-      %13 = arith.addi %in, %12 : i32
-      %14 = arith.trunci %13 : i32 to i8
-      linalg.yield %14 : i8
-    } -> tensor<2304x144xi8>
-    flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [2304, 144], strides = [1, 1] : tensor<2304x144xi8> -> !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
-    return
-  }
+func.func @quant_model() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %c12_i32 = arith.constant 12 : i32
+  %c-128_i32 = arith.constant -128 : i32
+  %c127_i32 = arith.constant 127 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<24x144xi8>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<144xi32>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
+  %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2304, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2304x24xi8>> -> tensor<2304x24xi8>
+  %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [24, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<24x144xi8>> -> tensor<24x144xi8>
+  %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [144], strides = [1] : !flow.dispatch.tensor<readonly:tensor<144xi32>> -> tensor<144xi32>
+  %7 = tensor.empty() : tensor<2304x144xi8>
+  %8 = tensor.empty() : tensor<2304x144xi32>
+  %9 = linalg.fill ins(%c0_i32 : i32) outs(%8 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
+  %10 = linalg.matmul ins(%4, %5 : tensor<2304x24xi8>, tensor<24x144xi8>) outs(%9 : tensor<2304x144xi32>) -> tensor<2304x144xi32>
+  %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%6, %10 : tensor<144xi32>, tensor<2304x144xi32>) outs(%7 : tensor<2304x144xi8>) {
+  ^bb0(%in: i32, %in_0: i32, %out: i8):
+    %12 = arith.subi %in_0, %c12_i32 : i32
+    %13 = arith.addi %in, %12 : i32
+    %14 = arith.trunci %13 : i32 to i8
+    linalg.yield %14 : i8
+  } -> tensor<2304x144xi8>
+  flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [2304, 144], strides = [1, 1] : tensor<2304x144xi8> -> !flow.dispatch.tensor<writeonly:tensor<2304x144xi8>>
+  return
 }
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -1360,22 +1560,26 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = false}>
-module {
-  func.func @test() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64
-    %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
-    %extracted = tensor.extract %2[] : tensor<i64>
-    %3 = arith.muli %extracted, %c6364136223846793005_i64 : i64
-    %4 = arith.addi %3, %c1442695040888963407_i64 : i64
-    %inserted = tensor.insert %4 into %2[] : tensor<i64>
-    flow.dispatch.tensor.store %inserted, %1, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
-    return
-  }
+func.func @test() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64
+  %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
+  %extracted = tensor.extract %2[] : tensor<i64>
+  %3 = arith.muli %extracted, %c6364136223846793005_i64 : i64
+  %4 = arith.addi %3, %c1442695040888963407_i64 : i64
+  %inserted = tensor.insert %4 into %2[] : tensor<i64>
+  flow.dispatch.tensor.store %inserted, %1, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+  return
 }
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -1384,33 +1588,38 @@ module {
// -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
#executable_target_system_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "system-elf-x86_64", {cpu = "cascadelake", cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", link_embedded = false, native_vector_size = 64 : index, target_triple = "x86_64-unknown-linux-gnu", ukernels = false}>
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
-module {
-  func.func @non_trivial_program() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1x128x1xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 1, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1x128x1xf32>> -> tensor<128x1x128x1xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x1xf32>> -> tensor<128x1xf32>
-    %5 = tensor.empty() : tensor<1x1xf32>
-    %6 = tensor.empty() : tensor<128xf32>
-    %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<128xf32>) -> tensor<128xf32>
-    %8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32>
-    %collapsed = tensor.collapse_shape %3 [[0, 1], [2, 3]] : tensor<128x1x128x1xf32> into tensor<128x128xf32>
-    %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%collapsed : tensor<128x128xf32>) outs(%7 : tensor<128xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %11 = arith.addf %out, %in : f32
-      linalg.yield %11 : f32
-    } -> tensor<128xf32>
-    %expanded = tensor.expand_shape %9 [[0, 1]] output_shape [1, 128] : tensor<128xf32> into tensor<1x128xf32>
-    %10 = linalg.matmul ins(%expanded, %4 : tensor<1x128xf32>, tensor<128x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
-    flow.dispatch.tensor.store %10, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1xf32>>
-    return
-  }
+func.func @non_trivial_program() attributes {hal.executable.target = #executable_target_system_elf_x86_64_} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1x128x1xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x1xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 1, 128, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x1x128x1xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x1xf32> + %5 = tensor.empty() : tensor<1x1xf32> + %6 = tensor.empty() : tensor<128xf32> + %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<128xf32>) -> tensor<128xf32> + %8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1xf32>) -> tensor<1x1xf32> + %collapsed = tensor.collapse_shape %3 [[0, 1], [2, 3]] : tensor<128x1x128x1xf32> into tensor<128x128xf32> + %9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%collapsed : tensor<128x128xf32>) outs(%7 : tensor<128xf32>) { + ^bb0(%in: f32, %out: f32): + %11 = arith.addf %out, %in : f32 + linalg.yield %11 : f32 + } -> tensor<128xf32> + %expanded = tensor.expand_shape %9 [[0, 1]] output_shape [1, 128] : tensor<128xf32> into tensor<1x128xf32> + %10 = linalg.matmul ins(%expanded, %4 : tensor<1x128xf32>, tensor<128x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32> + flow.dispatch.tensor.store %10, %2, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -1422,37 +1631,42 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}> -module { - func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = arith.extui %0 : i32 to i64 - %5 = arith.extui %1 : i32 to i64 - %6 = arith.shli %5, %c32_i64 : i64 - %7 = arith.ori %4, %6 : i64 - %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index - %9 = arith.extui %2 : i32 to i64 - %10 = arith.extui %3 : i32 to i64 - %11 = arith.shli %10, %c32_i64 : i64 - %12 = arith.ori %9, %11 : i64 - %13 = arith.index_castui %12 : i64 to index - %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %15 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor> - %16 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%13) : !flow.dispatch.tensor> - %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x10x32x8x1xf32> - %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32x4x1xf32> - %19 = tensor.empty() : tensor<128x10x80x8x4xf32> - %20 = linalg.fill ins(%cst : f32) outs(%19 : 
tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> - %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> - flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor> - return - } +func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = arith.extui %0 : i32 to i64 + %5 = arith.extui %1 : i32 to i64 + %6 = arith.shli %5, %c32_i64 : i64 + %7 = arith.ori %4, %6 : i64 + %8 = arith.index_castui %7 {stream.alignment = 64 : index} : i64 to index + %9 = arith.extui %2 : i32 to i64 + %10 = arith.extui %3 : i32 to i64 + %11 = arith.shli %10, %c32_i64 : i64 + %12 = arith.ori %9, %11 : i64 + %13 = arith.index_castui %12 : i64 to index + %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %15 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor> + %16 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%13) : !flow.dispatch.tensor> + %17 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 32, 8, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x10x32x8x1xf32> + %18 = flow.dispatch.tensor.load %15, offsets = [0, 0, 0, 0, 0], sizes = [128, 80, 32, 4, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32x4x1xf32> + %19 = tensor.empty() : tensor<128x10x80x8x4xf32> + %20 = linalg.fill ins(%cst : f32) outs(%19 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> + %21 = linalg.batch_mmt4d ins(%17, %18 : tensor<128x10x32x8x1xf32>, tensor<128x80x32x4x1xf32>) outs(%20 : tensor<128x10x80x8x4xf32>) -> tensor<128x10x80x8x4xf32> + flow.dispatch.tensor.store %21, %16, offsets = [0, 0, 0, 0, 0], sizes = [128, 10, 80, 8, 4], strides = [1, 1, 1, 1, 1] : tensor<128x10x80x8x4xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf"}> -module { - func.func @mmt4d_with_large_reduction() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) 
flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [7, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<7x18176x16x1xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [284, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<284x18176x16x1xf32> - %5 = tensor.empty() : tensor<7x284x16x16xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32> - %7 = linalg.mmt4d ins(%3, %4 : tensor<7x18176x16x1xf32>, tensor<284x18176x16x1xf32>) outs(%6 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [7, 284, 16, 16], strides = [1, 1, 1, 1] : tensor<7x284x16x16xf32> -> !flow.dispatch.tensor> - return - } +func.func @mmt4d_with_large_reduction() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [7, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<7x18176x16x1xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [284, 18176, 16, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<284x18176x16x1xf32> + %5 = tensor.empty() : tensor<7x284x16x16xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32> + %7 = linalg.mmt4d ins(%3, %4 : tensor<7x18176x16x1xf32>, tensor<284x18176x16x1xf32>) outs(%6 : tensor<7x284x16x16xf32>) -> tensor<7x284x16x16xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [7, 284, 16, 16], strides = [1, 1, 1, 1] : tensor<7x284x16x16xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -1487,22 +1706,26 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @pad_only() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c634816 = arith.constant 634816 : index - %c3846080 = arith.constant 3846080 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c634816) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3846080) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<1x112x112x64xf32> - %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] { - ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index): - tensor.yield %cst : f32 - } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1] : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor> - return - } +func.func @pad_only() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c634816 = arith.constant 634816 : index + %c3846080 = arith.constant 3846080 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c634816) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c3846080) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x112x112x64xf32> + %padded = tensor.pad %2 low[0, 1, 1, 0] high[0, 1, 1, 0] { + ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index): + tensor.yield %cst : f32 + } : tensor<1x112x112x64xf32> to tensor<1x114x114x64xf32> + flow.dispatch.tensor.store %padded, %1, offsets = [0, 0, 0, 0], sizes = [1, 114, 114, 64], strides = [1, 1, 1, 1] : tensor<1x114x114x64xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -1515,21 +1738,25 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @winograd_output_transform() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x8x2x6x6x128xf16> - %3 = tensor.empty() : tensor<2x36x36x128xf16> - %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128], strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor> - return - } +func.func @winograd_output_transform() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 
128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x8x2x6x6x128xf16> + %3 = tensor.empty() : tensor<2x36x36x128xf16> + %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128], strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -1540,21 +1767,25 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @winograd_input_transform() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x34x34x128xf16> - %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16> - %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor> - return - } +func.func @winograd_input_transform() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x34x34x128xf16> + %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16> + %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -1565,21 +1796,25 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { cpu = 
"generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @winograd_filter_transform() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 3, 64, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x64x128xf32> - %3 = tensor.empty() : tensor<8x8x64x128xf32> - %4 = iree_linalg_ext.winograd.filter_transform output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1]) ins(%2 : tensor<3x3x64x128xf32>) outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 8, 64, 128], strides = [1, 1, 1, 1] : tensor<8x8x64x128xf32> -> !flow.dispatch.tensor> - return - } +func.func @winograd_filter_transform() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 3, 64, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x64x128xf32> + %3 = tensor.empty() : tensor<8x8x64x128xf32> + %4 = iree_linalg_ext.winograd.filter_transform output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1]) ins(%2 : tensor<3x3x64x128xf32>) outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 8, 64, 128], strides = [1, 1, 1, 1] : tensor<8x8x64x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -1590,31 +1825,37 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @attention() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %scale = arith.constant 0.125 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) 
type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> - %7 = tensor.empty() : tensor<20x4096x64xf16> - %8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} - ins(%4, %5, %6, %scale : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) - outs(%7 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> - flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor> - return - } +func.func @attention() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %scale = arith.constant 0.125 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> + %7 = tensor.empty() : tensor<20x4096x64xf16> + %8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} + ins(%4, %5, %6, %scale : tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, tensor<20x4096x64xf16>, f16) + outs(%7 : tensor<20x4096x64xf16>) -> tensor<20x4096x64xf16> + flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : tensor<20x4096x64xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -1625,31 +1866,37 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", { cpu = "generic", cpu_features = "", data_layout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}> -module { - func.func @elementwise_output_transposed() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor - %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor> -> tensor<768xi64> - %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor> -> tensor<32xi64> - %7 = tensor.empty() : tensor<32x32x768xf32> - %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d1, d2, d0)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %5, %6 : tensor, tensor<768xi64>, tensor<32xi64>) outs(%7 : tensor<32x32x768xf32>) { - ^bb0(%in: i64, %in_0: i64, %in_1: i64, %out: f32): - %9 = arith.addi %in, %in_0 : i64 - %10 = arith.addi %9, %in_1 : i64 - %11 = arith.uitofp %10 : i64 to f32 - linalg.yield %11 : f32 - } -> tensor<32x32x768xf32> - flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [32, 32, 768], strides = [1, 1, 1] : tensor<32x32x768xf32> -> !flow.dispatch.tensor> - return - } +func.func @elementwise_output_transposed() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor + %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor> -> tensor<768xi64> + %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor> -> tensor<32xi64> + %7 = tensor.empty() : tensor<32x32x768xf32> + %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d1, d2, d0)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %5, %6 : tensor, tensor<768xi64>, tensor<32xi64>) outs(%7 : tensor<32x32x768xf32>) { + ^bb0(%in: i64, %in_0: i64, %in_1: i64, %out: f32): + %9 = arith.addi %in, %in_0 : i64 + %10 = arith.addi 
%9, %in_1 : i64 + %11 = arith.uitofp %10 : i64 to f32 + linalg.yield %11 : f32 + } -> tensor<32x32x768xf32> + flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [32, 32, 768], strides = [1, 1, 1] : tensor<32x32x768xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_and_fuse.mlir index 897a2a7fda50b..284dd8ce4f622 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_and_fuse.mlir @@ -101,19 +101,26 @@ func.func @multi_config(%arg0 : tensor, %arg1 : tensor, %arg2 // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @shared_out_operand() { %cst = arith.constant 0.000000e+00 : f32 %cst_0 = arith.constant 6.000000e+00 : f32 %c600576 = arith.constant 600576 : index %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 %2 = arith.index_castui %0 {stream.alignment = 1024 : index, stream.values = [205824 : index, 795648 : index, 1385472 : index, 1975296 : index, 2565120 : index, 3154944 : index, 3744768 : index]} : i32 to index %3 = arith.index_castui %1 {stream.alignment = 1024 : index, stream.values = [0 : index, 3072 : index, 6144 : index, 9216 : index, 12288 : index, 15360 : index, 18432 : index]} : i32 to index - %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor> - %6 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor> - %7 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c600576) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c600576) : !flow.dispatch.tensor> %8 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [391, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<391x384xf32> %9 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [384, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<384x384xf32> %10 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [384], strides = [1] : !flow.dispatch.tensor> -> tensor<384xf32> @@ -156,66 +163,6 @@ func.func @shared_out_operand() { // ----- -// This test is to check it doesnt crash. 
See #15126 -func.func @softmax() { - %c2 = arith.constant 2 : index - %c5 = arith.constant 5 : index - %cst = arith.constant 0xFF800000 : f32 - %c10 = arith.constant 10 : index - %c1 = arith.constant 1 : index - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant -1.000000e+30 : f32 - %c512 = arith.constant 512 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c512) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x10xf32> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x10xf32> - %4 = tensor.empty() : tensor<1xf32> - %5 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst_1 : f32) outs(%4 : tensor<1xf32>) -> tensor<1xf32> - %expanded = tensor.expand_shape %3 [[0], [1, 2]] output_shape [1, 5, 2] : tensor<1x10xf32> into tensor<1x5x2xf32> - %6 = tensor.empty() : tensor<1x2xf32> - %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1x2xf32>) -> tensor<1x2xf32> - %8 = scf.for %arg0 = %c0 to %c5 step %c1 iter_args(%arg1 = %7) -> (tensor<1x2xf32>) { - %extracted_slice = tensor.extract_slice %expanded[0, %arg0, 0] [1, 1, 2] [1, 1, 1] : tensor<1x5x2xf32> to tensor<1x1x2xf32> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>], iterator_types = ["parallel", "reduction", "parallel"]} ins(%extracted_slice : tensor<1x1x2xf32>) outs(%arg1 : tensor<1x2xf32>) { - ^bb0(%in: f32, %out: f32): - %14 = arith.maximumf %in, %out : f32 - linalg.yield %14 : f32 - } -> tensor<1x2xf32> - scf.yield %13 : tensor<1x2xf32> - } - %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%8 : tensor<1x2xf32>) outs(%5 : tensor<1xf32>) { - ^bb0(%in: f32, %out: f32): - %13 = arith.maximumf %in, %out : f32 - linalg.yield %13 : f32 - } -> tensor<1xf32> - %10 = linalg.fill {lowering_config = #iree_codegen.lowering_config} ins(%cst_0 : f32) outs(%4 : tensor<1xf32>) -> tensor<1xf32> - %11 = scf.for %arg0 = %c0 to %c10 step %c2 iter_args(%arg1 = %10) -> (tensor<1xf32>) { - %extracted_slice = tensor.extract_slice %3[0, %arg0] [1, 2] [1, 1] : tensor<1x10xf32> to tensor<1x2xf32> - %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%extracted_slice, %9 : tensor<1x2xf32>, tensor<1xf32>) outs(%arg1 : tensor<1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} { - ^bb0(%in: f32, %in_2: f32, %out: f32): - %14 = arith.subf %in, %in_2 : f32 - %15 = math.exp %14 : f32 - %16 = arith.addf %15, %out : f32 - linalg.yield %16 : f32 - } -> tensor<1xf32> - scf.yield %13 : tensor<1xf32> - } - %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%3, %9, %11 : tensor<1x10xf32>, tensor<1xf32>, tensor<1xf32>) outs(%2 : tensor<1x10xf32>) attrs = {lowering_config = #iree_codegen.lowering_config} { - ^bb0(%in: f32, %in_2: f32, %in_3: f32, %out: f32): - %13 = 
arith.subf %in, %in_2 : f32 - %14 = math.exp %13 : f32 - %15 = arith.divf %14, %in_3 : f32 - linalg.yield %15 : f32 - } -> tensor<1x10xf32> - flow.dispatch.tensor.store %12, %1, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : tensor<1x10xf32> -> !flow.dispatch.tensor> - return -} -// CHECK-LABEL: func @softmax() - -// ----- - func.func @scalable_matmul(%A: tensor, %B: tensor, %C: tensor) -> tensor{ // Matrix multiplication (ijk) with scalable tiling in the j-th dimension. %1 = linalg.matmul {lowering_config = #iree_codegen.lowering_config} ins(%A, %B: tensor, tensor) diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir index 79ff5c007c933..2c8388bf28b7c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/transform_dialect_bufferize.mlir @@ -1,10 +1,18 @@ // RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | FileCheck %s + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-none-elf"}> func.func @pad_matmul_static_dispatch_0() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<250x500xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<500x1020xf32> %5 = tensor.empty() : tensor<250x1020xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir index 290686134508f..6f2d6fbd07fb9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vector_lowering.mlir @@ -1,5 +1,13 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-llvmcpu-vector-lowering-pipeline))" --split-input-file %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> func.func @matmul_391x384x384_f32() { %cst = arith.constant 0.000000e+00 : 
f32 %c0 = arith.constant 0 : index @@ -11,13 +19,13 @@ func.func @matmul_391x384x384_f32() { %cst_0 = arith.constant dense<0.000000e+00> : vector<8x32xf32> %cst_1 = arith.constant dense<6.000000e+00> : vector<8x32xf32> %alloca = memref.alloca() {alignment = 64 : i64} : memref<8x32xf32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<391x384xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<391x384xf32> memref.assume_alignment %0, 64 : memref<391x384xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<384x384xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<384x384xf32> memref.assume_alignment %1, 64 : memref<384x384xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<384xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : memref<384xf32> memref.assume_alignment %2, 64 : memref<384xf32> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<391x384xf32> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<391x384xf32> memref.assume_alignment %3, 64 : memref<391x384xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -75,6 +83,14 @@ func.func @matmul_391x384x384_f32() { // Check that vector.loads whose elements are extracted and // consumed in a scalar fashion are scalarized. 
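// As a sketch (not part of this patch; %m and %i are hypothetical values),
// the scalarization being checked rewrites
//   %v = vector.load %m[%i] : memref<384xf32>, vector<4xf32>
//   %e = vector.extract %v[0] : f32 from vector<4xf32>
// into a plain scalar load when every lane is consumed this way:
//   %e = memref.load %m[%i] : memref<384xf32>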
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> func.func @matmul_scalar_loads() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index @@ -86,13 +102,13 @@ func.func @matmul_scalar_loads() { %cst_0 = arith.constant dense<0.000000e+00> : vector<8x32xf32> %cst_1 = arith.constant dense<6.000000e+00> : vector<8x32xf32> %alloca = memref.alloca() {alignment = 64 : i64} : memref<8x32xf32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<391x384xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<391x384xf32> memref.assume_alignment %0, 64 : memref<391x384xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<384x384xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<384x384xf32> memref.assume_alignment %1, 64 : memref<384x384xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<384xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : memref<384xf32> memref.assume_alignment %2, 64 : memref<384xf32> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<391x384xf32> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<391x384xf32> memref.assume_alignment %3, 64 : memref<391x384xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -130,11 +146,16 @@ func.func @matmul_scalar_loads() { // Make sure we don't transpose a mask but create a transposed mask instead. +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @transpose_mask() { %a = arith.constant 4 : index %b = arith.constant 8 : index %c0 = arith.constant 0 : index - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<4x2xi1> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<4x2xi1> %mask = vector.create_mask %a, %b : vector<2x4xi1> %transpose_mask = vector.transpose %mask, [1, 0] : vector<2x4xi1> to vector<4x2xi1> vector.transfer_write %transpose_mask, %3[%c0, %c0] {in_bounds = [true, true]} : vector<4x2xi1>, memref<4x2xi1> @@ -153,17 +174,24 @@ func.func @transpose_mask() { // Make sure that the gather patterns get rid of vector.gather over strided // memref. 
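// For illustration (a sketch, not from the patch; %base, %idx, %mask, and
// %pass are hypothetical values over the strided subview created below):
//   %g = vector.gather %base[%c0] [%idx], %mask, %pass
//        : memref<2592000xf32, strided<[3]>>, vector<4xi32>, vector<4xi1>,
//          vector<4xf32> into vector<4xf32>
// should lower to per-lane scalar memref.loads, since the non-unit stride
// rules out a contiguous masked-gather lowering.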
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @gather_strided_memref() { %cst = arith.constant dense<0.000000e+00> : vector<4xf32> %cst_0 = arith.constant dense : vector<4xi1> %c0_i32 = arith.constant 0 : i32 %c4 = arith.constant 4 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<2592000x3xf32, #hal.descriptor_type> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<2592000x3xf32, #hal.descriptor_type> memref.assume_alignment %0, 64 : memref<2592000x3xf32, #hal.descriptor_type> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<518400xi32, #hal.descriptor_type> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<518400xi32, #hal.descriptor_type> memref.assume_alignment %1, 64 : memref<518400xi32, #hal.descriptor_type> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<518400xf32, #hal.descriptor_type> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<518400xf32, #hal.descriptor_type> memref.assume_alignment %2, 64 : memref<518400xf32, #hal.descriptor_type> %subview = memref.subview %0[0, 0] [2592000, 1] [1, 1] : memref<2592000x3xf32, #hal.descriptor_type> to memref<2592000xf32, strided<[3]>, #hal.descriptor_type> %workgroup_id_x = hal.interface.workgroup.id[0] : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir index b6450d346fcce..2e9b3c44c923a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/vectorize_with_masking_and_hoist.mlir @@ -34,15 +34,22 @@ // CHECK-NEXT: %[[INSERT_SLICE:.*]] = tensor.insert_slice %[[OUT_WRITE]] into %[[OUT_SLICE]]{{.*}} : tensor<8x?xf32> into tensor<8x?xf32> // CHECK-NEXT: tensor.insert_slice %[[INSERT_SLICE]] into %[[OUT_TENSOR_1]]{{.*}} : tensor<8x?xf32> into tensor<1024x1024xf32> +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @pipeline() { %c1 = arith.constant 1 : index %c1024 = arith.constant 1024 : index %c16 = arith.constant 16 : index %c8 = arith.constant 8 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x1024xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x1024xf32> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x1024xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir index c6ee662785410..5d7e055a2034c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/verify_linalg_transform_legality.mlir @@ -1,11 +1,18 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-verify-linalg-transform-legality))" %s --verify-diagnostics -split-input-file +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @matmul_123x456xf32_times_456x789xf32_into_123x789xf32_dispatch_0() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [123, 4, 114], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<123x4x114xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 114, 789], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x114x789xf32> %5 = tensor.empty() : tensor<4x123x789xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td index c9467079de117..382de0efd0612 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td @@ -165,7 +165,7 @@ def VectorToWarpExecuteOnLane0Op : Op + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128xf32> %1 = gpu.thread_id x %2 = arith.cmpi ult, %1, %c1 : index scf.if %2 { @@ -186,7 +186,7 @@ def VectorToWarpExecuteOnLane0Op : Op : vector<128xf32> - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : 
memref<128xf32> %1 = gpu.thread_id x %2 = arith.cmpi ult, %1, %c32 : index // Single-warp guard filters out threads 32-63. @@ -266,7 +266,7 @@ def VectorWarpDistributionOp : Op : vector<128xf32> - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128xf32> %1 = gpu.thread_id x %2 = arith.cmpi ult, %1, %c32 : index // Single-warp guard filters out threads 32-63. @@ -290,7 +290,7 @@ def VectorWarpDistributionOp : Op : vector<128xf32> - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128xf32> %1 = gpu.thread_id x %2 = arith.cmpi ult, %1, %c32 : index // Single-warp guard filters out threads 32-63. diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir index ad80fdf2498b9..6c736077d060a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir @@ -19,13 +19,13 @@ #pipeline_layout = #hal.pipeline.layout, - #hal.descriptor_set.binding<1, storage_buffer> + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> ]> ]> hal.executable public @main_0_dispatch_0 { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) - attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { + hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index @@ -54,9 +54,9 @@ hal.executable public @main_0_dispatch_0 { }>} { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1280xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> 
      %5 = tensor.empty() : tensor<2048x10240xf32>
@@ -95,13 +95,13 @@ hal.executable public @main_0_dispatch_0 {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable public @main_0_dispatch_0 {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout)
-        attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
+    hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device):
       %x, %y, %z = flow.dispatch.workgroup_count_from_slice
       hal.return %x, %y, %z : index, index, index
@@ -131,9 +131,9 @@ hal.executable public @main_0_dispatch_0 {
      }>} {
       %cst = arith.constant 0.000000e+00 : f16
       %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
      %5 = tensor.empty() : tensor<2048x10240xf32>
@@ -167,13 +167,13 @@ hal.executable public @main_0_dispatch_0 {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable public @main_0_dispatch_0 {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout)
-        attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
+    hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device):
       %x, %y, %z = flow.dispatch.workgroup_count_from_slice
       hal.return %x, %y, %z : index, index, index
@@ -192,9 +192,9 @@ hal.executable public @main_0_dispatch_0 {
      }>} {
       %cst = arith.constant 0.000000e+00 : f16
       %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
      %5 = tensor.empty() : tensor<2048x10240xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
index 3aea05886f5ff..08ef0e89d2cc3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
@@ -13,29 +13,34 @@
 // CHECK-SAME:     intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME:     subgroup_m_count = 1, subgroup_n_count = 4
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>
 #map1 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>
 #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
-module {
-  func.func @expanded_matmul_transpose_b() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 64, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>> -> tensor<2x64x2048xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>> -> tensor<10x64x2048xf16>
-    %5 = tensor.empty() : tensor<2x10x64x64xf16>
-    %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x10x64x64xf16>) -> tensor<2x10x64x64xf16>
-    %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<2x64x2048xf16>, tensor<10x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %8 = arith.mulf %in, %in_0 : f16
-      %9 = arith.addf %8, %out : f16
-      linalg.yield %9 : f16
-    } -> tensor<2x10x64x64xf16>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 64], strides = [1, 1, 1, 1] : tensor<2x10x64x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
-    return
-  }
+func.func @expanded_matmul_transpose_b() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 64, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>> -> tensor<2x64x2048xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 64, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>> -> tensor<10x64x2048xf16>
+  %5 = tensor.empty() : tensor<2x10x64x64xf16>
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x10x64x64xf16>) -> tensor<2x10x64x64xf16>
+  %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<2x64x2048xf16>, tensor<10x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) {
+  ^bb0(%in: f16, %in_0: f16, %out: f16):
+    %8 = arith.mulf %in, %in_0 : f16
+    %9 = arith.addf %8, %out : f16
+    linalg.yield %9 : f16
+  } -> tensor<2x10x64x64xf16>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 64], strides = [1, 1, 1, 1] : tensor<2x10x64x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
+  return
 }
 // CHECK-LABEL: func.func @expanded_matmul_transpose_b()
@@ -49,21 +54,26 @@ module {
 // CHECK-SAME:     intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2
-module {
-  func.func @conv_nhwc() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 258, 514, 768], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>> -> tensor<2x258x514x768xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 768, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>> -> tensor<3x3x768x256xf16>
-    %5 = tensor.empty() : tensor<2x256x512x256xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<2x258x514x768xf16>, tensor<3x3x768x256xf16>) outs(%6 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 256, 512, 256], strides = [1, 1, 1, 1] : tensor<2x256x512x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @conv_nhwc() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 258, 514, 768], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>> -> tensor<2x258x514x768xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 768, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>> -> tensor<3x3x768x256xf16>
+  %5 = tensor.empty() : tensor<2x256x512x256xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<2x258x514x768xf16>, tensor<3x3x768x256xf16>) outs(%6 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 256, 512, 256], strides = [1, 1, 1, 1] : tensor<2x256x512x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
+  return
 }
 // CHECK-LABEL: func.func @conv_nhwc()
@@ -71,6 +81,13 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #target = #iree_gpu.target>
 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #target}>
-module {
-  func.func @matmul_256x256x256() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
-    %5 = tensor.empty() : tensor<256x256xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<256x256xf32>) -> tensor<256x256xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<256x256xf16>, tensor<256x256xf16>) outs(%6 : tensor<256x256xf32>) -> tensor<256x256xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
-    return
-  }
+func.func @matmul_256x256x256() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
+  %5 = tensor.empty() : tensor<256x256xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<256x256xf16>, tensor<256x256xf16>) outs(%6 : tensor<256x256xf32>) -> tensor<256x256xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : tensor<256x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
+  return
 }
 // Check that we do not use the distribute pipeline if there are no supported
@@ -107,21 +122,26 @@ module {
 // CHECK-SAME:     intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2
-module {
-  func.func @mfma_matmul_1024x1024x1024() {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
-    %5 = tensor.empty() : tensor<1024x1024xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @mfma_matmul_1024x1024x1024() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+  %5 = tensor.empty() : tensor<1024x1024xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+  return
 }
 // CHECK-LABEL: func.func @mfma_matmul_1024x1024x1024()
@@ -135,39 +155,44 @@ module {
 // CHECK-SAME:     intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d5, d2 + d6, d3 + d7, d8)>
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d1, d5, d6, d7, d4, d8)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7, d8) -> (d0, d1, d2, d3, d4)>
 #map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-module {
-  func.func @conv_nchwc() {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [2, 20, 34, 34, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>> -> tensor<2x20x34x34x64xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 20, 3, 3, 160, 64], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>> -> tensor<8x20x3x3x160x64xf16>
-    %5 = tensor.empty() : tensor<2x8x32x32x160xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x8x32x32x160xf32>) -> tensor<2x8x32x32x160xf32>
-    %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "reduction"]} ins(%3, %4 : tensor<2x20x34x34x64xf16>, tensor<8x20x3x3x160x64xf16>) outs(%6 : tensor<2x8x32x32x160xf32>) attrs = {lowering_config = #config} {
-    ^bb0(%in: f16, %in_0: f16, %out: f32):
-      %10 = arith.extf %in : f16 to f32
-      %11 = arith.extf %in_0 : f16 to f32
-      %12 = arith.mulf %10, %11 : f32
-      %13 = arith.addf %out, %12 : f32
-      linalg.yield %13 : f32
-    } -> tensor<2x8x32x32x160xf32>
-    %8 = tensor.empty() : tensor<2x8x32x32x160xf16>
-    %9 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x8x32x32x160xf32>) outs(%8 : tensor<2x8x32x32x160xf16>) {
-    ^bb0(%in: f32, %out: f16):
-      %10 = arith.truncf %in : f32 to f16
-      linalg.yield %10 : f16
-    } -> tensor<2x8x32x32x160xf16>
-    flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0, 0], sizes = [2, 8, 32, 32, 160], strides = [1, 1, 1, 1, 1] : tensor<2x8x32x32x160xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
-    return
-  }
+func.func @conv_nchwc() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [2, 20, 34, 34, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x20x34x34x64xf16>> -> tensor<2x20x34x34x64xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 20, 3, 3, 160, 64], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x20x3x3x160x64xf16>> -> tensor<8x20x3x3x160x64xf16>
+  %5 = tensor.empty() : tensor<2x8x32x32x160xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x8x32x32x160xf32>) -> tensor<2x8x32x32x160xf32>
+  %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction", "reduction"]} ins(%3, %4 : tensor<2x20x34x34x64xf16>, tensor<8x20x3x3x160x64xf16>) outs(%6 : tensor<2x8x32x32x160xf32>) attrs = {lowering_config = #config} {
+  ^bb0(%in: f16, %in_0: f16, %out: f32):
+    %10 = arith.extf %in : f16 to f32
+    %11 = arith.extf %in_0 : f16 to f32
+    %12 = arith.mulf %10, %11 : f32
+    %13 = arith.addf %out, %12 : f32
+    linalg.yield %13 : f32
+  } -> tensor<2x8x32x32x160xf32>
+  %8 = tensor.empty() : tensor<2x8x32x32x160xf16>
+  %9 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<2x8x32x32x160xf32>) outs(%8 : tensor<2x8x32x32x160xf16>) {
+  ^bb0(%in: f32, %out: f16):
+    %10 = arith.truncf %in : f32 to f16
+    linalg.yield %10 : f16
+  } -> tensor<2x8x32x32x160xf16>
+  flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0, 0], sizes = [2, 8, 32, 32, 160], strides = [1, 1, 1, 1, 1] : tensor<2x8x32x32x160xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x8x32x32x160xf16>>
+  return
 }
 // CHECK-LABEL: func.func @conv_nchwc()
@@ -181,21 +206,26 @@ module {
 // WMMA-SAME:     intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>
 // WMMA-SAME:     subgroup_m_count = 2, subgroup_n_count = 2
-module {
-  func.func @wmma_matmul_1024x1024x1024() {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
-    %5 = tensor.empty() : tensor<1024x1024xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @wmma_matmul_1024x1024x1024() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf16>> -> tensor<1024x1024xf16>
+  %5 = tensor.empty() : tensor<1024x1024xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
+  return
 }
 // WMMA-LABEL: func.func @wmma_matmul_1024x1024x1024()
@@ -209,21 +239,26 @@ module {
 // CHECK-SAME:     intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME:     subgroup_m_count = 1, subgroup_n_count = 1
-module {
-  func.func @unaligned_mk_batch_matmul() {
-    %cst = arith.constant 0.000000e+00 : f16
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1281xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1281x1281xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1281xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1281], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1281xf16>> -> tensor<64x968x1281xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1281, 1281], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1281x1281xf16>> -> tensor<64x1281x1281xf16>
-    %5 = tensor.empty() : tensor<64x968x1281xf16>
-    %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1281xf16>) -> tensor<64x968x1281xf16>
-    %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1281xf16>, tensor<64x1281x1281xf16>) outs(%6 : tensor<64x968x1281xf16>) -> tensor<64x968x1281xf16>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1281], strides = [1, 1, 1] : tensor<64x968x1281xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1281xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @unaligned_mk_batch_matmul() {
+  %cst = arith.constant 0.000000e+00 : f16
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1281xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1281x1281xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1281xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1281], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x1281xf16>> -> tensor<64x968x1281xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1281, 1281], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1281x1281xf16>> -> tensor<64x1281x1281xf16>
+  %5 = tensor.empty() : tensor<64x968x1281xf16>
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x1281xf16>) -> tensor<64x968x1281xf16>
+  %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x1281xf16>, tensor<64x1281x1281xf16>) outs(%6 : tensor<64x968x1281xf16>) -> tensor<64x968x1281xf16>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 1281], strides = [1, 1, 1] : tensor<64x968x1281xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x1281xf16>>
+  return
 }
 // CHECK-LABEL: func.func @unaligned_mk_batch_matmul()
 // CHECK:         linalg.batch_matmul
@@ -237,21 +272,26 @@ module {
 // CHECK-SAME:     intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME:     subgroup_m_count = 1, subgroup_n_count = 4
-module {
-  func.func @unaligned_m_batch_matmul_64x72x1280x1280() {
-    %cst = arith.constant 0.000000e+00 : f16
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x72x1280xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x72x1280xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 72, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x72x1280xf16>> -> tensor<64x72x1280xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
-    %5 = tensor.empty() : tensor<64x72x1280xf16>
-    %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x72x1280xf16>) -> tensor<64x72x1280xf16>
-    %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x72x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x72x1280xf16>) -> tensor<64x72x1280xf16>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 72, 1280], strides = [1, 1, 1] : tensor<64x72x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x72x1280xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @unaligned_m_batch_matmul_64x72x1280x1280() {
+  %cst = arith.constant 0.000000e+00 : f16
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x72x1280xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x72x1280xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 72, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x72x1280xf16>> -> tensor<64x72x1280xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1280, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1280x1280xf16>> -> tensor<64x1280x1280xf16>
+  %5 = tensor.empty() : tensor<64x72x1280xf16>
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x72x1280xf16>) -> tensor<64x72x1280xf16>
+  %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x72x1280xf16>, tensor<64x1280x1280xf16>) outs(%6 : tensor<64x72x1280xf16>) -> tensor<64x72x1280xf16>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 72, 1280], strides = [1, 1, 1] : tensor<64x72x1280xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x72x1280xf16>>
+  return
 }
 // CHECK-LABEL: func.func @unaligned_m_batch_matmul_64x72x1280x1280()
 // CHECK:         linalg.batch_matmul
@@ -259,21 +299,26 @@ module {
 // -----
-module {
-  func.func @narrow_n_batch_matmul_64x968x4x320_f16() {
-    %cst = arith.constant 0.000000e+00 : f16
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x4xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 320], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>> -> tensor<64x968x320xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 320, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>> -> tensor<64x320x4xf16>
-    %5 = tensor.empty() : tensor<64x968x4xf16>
-    %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x4xf16>) -> tensor<64x968x4xf16>
-    %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x320xf16>, tensor<64x320x4xf16>) outs(%6 : tensor<64x968x4xf16>) -> tensor<64x968x4xf16>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 4], strides = [1, 1, 1] : tensor<64x968x4xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x4xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @narrow_n_batch_matmul_64x968x4x320_f16() {
+  %cst = arith.constant 0.000000e+00 : f16
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x4xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 320], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x968x320xf16>> -> tensor<64x968x320xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 320, 4], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320x4xf16>> -> tensor<64x320x4xf16>
+  %5 = tensor.empty() : tensor<64x968x4xf16>
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x968x4xf16>) -> tensor<64x968x4xf16>
+  %7 = linalg.batch_matmul ins(%3, %4 : tensor<64x968x320xf16>, tensor<64x320x4xf16>) outs(%6 : tensor<64x968x4xf16>) -> tensor<64x968x4xf16>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [64, 968, 4], strides = [1, 1, 1] : tensor<64x968x4xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x968x4xf16>>
+  return
 }
 // Check that we don't support LLVMGPUPadAndVectorDistribute for narrow N/M atm.
 // CHECK-NOT: #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute
-module {
-  func.func @matmul_dynamic_dim() {
-    %c0 = arith.constant 0 : index
-    %c32_i64 = arith.constant 32 : i64
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = arith.extui %0 : i32 to i64
-    %3 = arith.extui %1 : i32 to i64
-    %4 = arith.shli %3, %c32_i64 : i64
-    %5 = arith.ori %2, %4 : i64
-    %6 = arith.index_castui %5 : i64 to index
-    %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-    %8 = flow.dispatch.workload.ordinal %6, 0 : index
-    %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x256xf16>>{%8}
-    %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x256xf32>>{%8}
-    %11 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [%8, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x256xf16>>{%8} -> tensor<?x256xf16>
-    %12 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
-    %13 = tensor.empty(%8) : tensor<?x256xf32>
-    %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<?x256xf32>) -> tensor<?x256xf32>
-    %15 = linalg.matmul ins(%11, %12 : tensor<?x256xf16>, tensor<256x256xf16>) outs(%14 : tensor<?x256xf32>) -> tensor<?x256xf32>
-    flow.dispatch.tensor.store %15, %10, offsets = [0, 0], sizes = [%8, 256], strides = [1, 1] : tensor<?x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x256xf32>>{%8}
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_dynamic_dim() {
+  %c0 = arith.constant 0 : index
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = arith.extui %0 : i32 to i64
+  %3 = arith.extui %1 : i32 to i64
+  %4 = arith.shli %3, %c32_i64 : i64
+  %5 = arith.ori %2, %4 : i64
+  %6 = arith.index_castui %5 : i64 to index
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+  %8 = flow.dispatch.workload.ordinal %6, 0 : index
+  %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x256xf16>>{%8}
+  %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x256xf32>>{%8}
+  %11 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [%8, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x256xf16>>{%8} -> tensor<?x256xf16>
+  %12 = flow.dispatch.tensor.load %7, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
+  %13 = tensor.empty(%8) : tensor<?x256xf32>
+  %14 = linalg.fill ins(%cst : f32) outs(%13 : tensor<?x256xf32>) -> tensor<?x256xf32>
+  %15 = linalg.matmul ins(%11, %12 : tensor<?x256xf16>, tensor<256x256xf16>) outs(%14 : tensor<?x256xf32>) -> tensor<?x256xf32>
+  flow.dispatch.tensor.store %15, %10, offsets = [0, 0], sizes = [%8, 256], strides = [1, 1] : tensor<?x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x256xf32>>{%8}
+  return
 }
 // Check that we have unhandled dynamic dimension.
 // CHECK-NOT: iree_codegen.translation_info
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 func.func @attention_20x4096x64x4096x64() {
   %cst = arith.constant 1.250000e-01 : f16
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
-  %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<20x4096x64xf16>>
   %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
   %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
   %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x4096x64xf16>> -> tensor<20x4096x64xf16>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
index 2564419b3cc96..5c0ba99d8603d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/lowering_scalar_dispatch.mlir
@@ -16,8 +16,8 @@ hal.executable @scalar_dispatch {
       %c0 = arith.constant 0 : index
       %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64
      %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
       %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
       %extracted = tensor.extract %2[] : tensor<i64>
       %3 = arith.muli %extracted, %c6364136223846793005_i64 : i64
@@ -32,8 +32,8 @@ hal.executable @scalar_dispatch {
 // CHECK-LABEL: func.func @scalar_dispatch()
 //  CHECK-SAME:   translation_info = #iree_codegen.translation_info
-//       CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0)
-//       CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1)
+//       CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+//       CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 //       CHECK: memref.load %[[SPAN0]][] : memref<i64, #hal.descriptor_type<storage_buffer>>
 //       CHECK: arith.muli {{.+}} : i64
 //       CHECK: arith.addi {{.+}} : i64
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 95463a872aa23..5d404ba7c7bc9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -4,14 +4,14 @@
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 #config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 4], thread = [8, 4]}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @matmul_transpose_b ordinal(0) layout(#pipeline_layout)
-        attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
+    hal.executable.export public @matmul_transpose_b ordinal(0) layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device):
       %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -21,9 +21,9 @@ hal.executable public @main {
         attributes {translation_info = #iree_codegen.translation_info} {
       %cst = arith.constant 0.000000e+00 : f16
       %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
       %5 = tensor.empty() : tensor<2048x10240xf32>
@@ -42,9 +42,9 @@ hal.executable public @main {
 //         analysis should be able to simplify the below to just two barriers.
 // CHECK-LABEL: func @matmul_transpose_b
-//   CHECK-DAG:   %[[B0:.+]] = hal.interface.binding.subspan set(0) binding(0)
-//   CHECK-DAG:   %[[B1:.+]] = hal.interface.binding.subspan set(0) binding(1)
-//   CHECK-DAG:   %[[B2:.+]] = hal.interface.binding.subspan set(0) binding(2)
+//   CHECK-DAG:   %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+//   CHECK-DAG:   %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+//   CHECK-DAG:   %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
 //   CHECK-DAG:   memref.alloc() : memref<64x4xf16, #gpu.address_space<workgroup>>
 //   CHECK-DAG:   memref.alloc() : memref<64x4xf16, #gpu.address_space<workgroup>>
 //       CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c1280 step %c4 {{.*}} -> (vector<8x4xf32>)
@@ -68,14 +68,14 @@ hal.executable public @main {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 #config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 2], subgroup = [2, 2], mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @matmul_transpose_b_mfma ordinal(0) layout(#pipeline_layout)
-        attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
+    hal.executable.export public @matmul_transpose_b_mfma ordinal(0) layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device):
       %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -85,9 +85,9 @@ hal.executable public @main {
         attributes {translation_info = #iree_codegen.translation_info} {
      %cst = arith.constant 0.000000e+00 : f16
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
       %5 = tensor.empty() : tensor<2048x10240xf32>
@@ -103,9 +103,9 @@ hal.executable public @main {
 }
 // CHECK-LABEL: func @matmul_transpose_b_mfma
-//   CHECK-DAG:   %[[B0:.+]] = hal.interface.binding.subspan set(0) binding(0)
-//   CHECK-DAG:   %[[B1:.+]] = hal.interface.binding.subspan set(0) binding(1)
-//   CHECK-DAG:   %[[B2:.+]] = hal.interface.binding.subspan set(0) binding(2)
+//   CHECK-DAG:   %[[B0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+//   CHECK-DAG:   %[[B1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+//   CHECK-DAG:   %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
 //   CHECK-DAG:   memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>>
 //   CHECK-DAG:   memref.alloc() : memref<64x32xf16, #gpu.address_space<workgroup>>
 //       CHECK:   %[[LOOP:.+]] = scf.for %[[IV:.+]] = %c0 to %c80 step %c2 {{.*}} -> (vector<2x2x4x1xf32>)
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index 5cb3e82efc433..629c41c72ef29 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -15,7 +15,8 @@
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @matmul_256x256x256_f16_f32 {
@@ -29,9 +30,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
     func.func @matmul_256x256x256_f16_f32() {
       %cst = arith.constant 0.000000e+00 : f32
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
       %5 = tensor.empty() : tensor<256x256xf32>
@@ -65,7 +66,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @matmul_256x256x256_f16_f16 {
@@ -79,9 +81,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
     func.func @matmul_256x256x256_f16_f16() {
      %cst = arith.constant 0.000000e+00 : f16
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf16>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf16>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
       %5 = tensor.empty() : tensor<256x256xf16>
@@ -113,7 +115,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @expanded_matmul_transpose_b_executable {
@@ -127,11 +130,11 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
     func.func @expanded_matmul_transpose_b() {
       %c0 = arith.constant 0 : index
       %cst = arith.constant 0.000000e+00 : f16
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0)
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0)
         : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0)
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0)
         : !flow.dispatch.tensor<readonly:tensor<10x64x2048xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0)
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0)
         : !flow.dispatch.tensor<writeonly:tensor<2x10x64x64xf16>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 64, 2048], strides = [1, 1, 1]
         : !flow.dispatch.tensor<readonly:tensor<2x64x2048xf16>> -> tensor<2x64x2048xf16>
@@ -184,7 +187,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @matmul_256x256x256_f8_f32 {
@@ -198,9 +202,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
    func.func @matmul_256x256x256_f8_f32() {
      %cst = arith.constant 0.000000e+00 : f32
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf8E4M3FNUZ>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf8E4M3FNUZ>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf8E4M3FNUZ>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf8E4M3FNUZ>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf8E4M3FNUZ>> -> tensor<256x256xf8E4M3FNUZ>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf8E4M3FNUZ>> -> tensor<256x256xf8E4M3FNUZ>
      %5 = tensor.empty() : tensor<256x256xf32>
@@ -234,7 +238,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @matmul_256x256x256_i8_i32 {
@@ -248,9 +253,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
    func.func @matmul_256x256x256_i8_i32() {
      %cst = arith.constant 0 : i32
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xi32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xi32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
       %5 = tensor.empty() : tensor<256x256xi32>
@@ -284,7 +289,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @matmul_transpose_b_256x256x256_i8_i32 {
@@ -298,9 +304,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
    func.func @matmul_transpose_b_256x256x256_i8_i32() {
      %cst = arith.constant 0 : i32
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xi32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xi8>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xi32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xi8>> -> tensor<256x256xi8>
       %5 = tensor.empty() : tensor<256x256xi32>
@@ -327,13 +333,13 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 // CHECK-COUNT-32:  amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<8xi8>, vector<8xi8>, vector<4xi32>
 //  CHECK-COUNT-4:  vector.transfer_write {{.+}} {in_bounds = [true, true]} : vector<4x1xi32>, memref<256x256xi32, #hal.descriptor_type<storage_buffer>>
-
 // -----
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @conv_nhwc_dispatch_0 {
@@ -347,9 +353,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
    func.func @conv_nhwc() {
      %c0 = arith.constant 0 : index
      %cst = arith.constant 0.000000e+00 : f32
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x256x512x256xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 258, 514, 768], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x258x514x768xf16>> -> tensor<2x258x514x768xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 768, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x768x256xf16>> -> tensor<3x3x768x256xf16>
       %5 = tensor.empty() : tensor<2x256x512x256xf32>
@@ -371,19 +377,17 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
 #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>
 #map1 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>
 #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
-#pipeline_layout = #hal.pipeline.layout<
-  push_constants = 2,
-  sets = [
-    <0, bindings = [
-      <0, storage_buffer, ReadOnly>,
-      <1, storage_buffer, ReadOnly>,
-      <2, storage_buffer>
-    ]>
-  ]>
 hal.executable public @main_dispatch_expanded_matmul {
   hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
     hal.executable.export public @generic_2x1024x20x64x1280_f16 ordinal(0) layout(#pipeline_layout) attributes {
@@ -400,13 +404,13 @@ hal.executable public @main_dispatch_expanded_matmul {
    func.func @generic_2x1024x20x64x1280_f16() {
      %cst = arith.constant 0.000000e+00 : f16
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.constant.load[0] : i32
-      %1 = hal.interface.constant.load[1] : i32
+      %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+      %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
       %2 = arith.index_castui %0 : i32 to index
       %3 = arith.index_castui %1 : i32 to index
-      %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1024x1280xf16>>
-      %5 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x64x1280xf16>>
-      %6 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<2x1024x20x64xf16>>
+      %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%2) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x1024x1280xf16>>
+      %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<20x64x1280xf16>>
+      %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%3) : !flow.dispatch.tensor<writeonly:tensor<2x1024x20x64xf16>>
       %7 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [2, 1024, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x1280xf16>> -> tensor<2x1024x1280xf16>
       %8 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [20, 64, 1280], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<20x64x1280xf16>> -> tensor<20x64x1280xf16>
       %9 = tensor.empty() : tensor<2x1024x20x64xf16>
@@ -450,7 +454,8 @@ hal.executable public @main_dispatch_expanded_matmul {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>
 ]>
 hal.executable @matmul_256x256x256_f16_f32 {
@@ -464,9 +469,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
    func.func @matmul_256x256x256_f16_f32() {
      %cst = arith.constant 0.000000e+00 : f32
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<256x256xf16>>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<256x256xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x256xf16>> -> tensor<256x256xf16>
       %5 = tensor.empty() : tensor<256x256xf32>
@@ -501,7 +506,8 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<1, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
  ]>
 ]>
 hal.executable @unaligned_mk_batch_matmul_64x978x1281x1281_f16_f16 {
@@ -515,9 +521,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
    func.func @unaligned_nk_batch_matmul() {
      %cst = arith.constant 0.000000e+00 : f16
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x968x1281xf16>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<64x1281x1281xf16>>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x968x1281xf16>>
+      %0 =
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [64, 968, 1281], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x968x1281xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [64, 1281, 1281], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x1281x1281xf16> %5 = tensor.empty() : tensor<64x968x1281xf16> @@ -542,9 +548,9 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // CHECK-DAG: %[[RHS_SHARED_SUB:.+]] = memref.subview %[[RHS_SHARED]][0, 0, 0] [1, 16, 16] [1, 1, 1] // CHECK-DAG: %[[LHS_SHARED:.+]] = memref.alloc() : memref<1x16x20xf16, #gpu.address_space> // CHECK-DAG: %[[LHS_SHARED_SUB:.+]] = memref.subview %[[LHS_SHARED]][0, 0, 0] [1, 16, 16] [1, 1, 1] -// CHECK-DAG: %[[LHS_GLOBAL:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1281xf16, #hal.descriptor_type> -// CHECK-DAG: %[[RHS_GLOBAL:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1281x1281xf16, #hal.descriptor_type> -// CHECK-DAG: %[[OUT_GLOBAL:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<64x968x1281xf16, #hal.descriptor_type> +// CHECK-DAG: %[[LHS_GLOBAL:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x968x1281xf16, #hal.descriptor_type> +// CHECK-DAG: %[[RHS_GLOBAL:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<64x1281x1281xf16, #hal.descriptor_type> +// CHECK-DAG: %[[OUT_GLOBAL:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%c0) : memref<64x968x1281xf16, #hal.descriptor_type> // CHECK-DAG: %[[LHS_GLOBAL_SUB:.+]] = memref.subview %[[LHS_GLOBAL]] // CHECK-DAG: %[[RHS_GLOBAL_SUB:.+]] = memref.subview %[[RHS_GLOBAL]] // CHECK: %[[LHS_LOAD:.+]] = vector.transfer_read %[[LHS_GLOBAL_SUB]]{{.+}} {in_bounds = [true, false, false]} @@ -581,14 +587,12 @@ hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { // NOTE: This test is not exhaustive of all possible ways the above condition is breaking, // but rather is an example of a matmul shape from a model that broke our compilation heuristic. 
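Every hunk in this patch performs the same mechanical rewrite, and the test below receives it too. Distilled to its essentials the migration looks roughly as follows; this is an illustrative sketch, not an excerpt from any test here (the %c0 operand and the memref<16xf32> result type are invented, and the exact push_constants spelling in the new verbose form is assumed from the surrounding hunks):

  // Before: shorthand set/binding entries, and every subspan repeats a
  // type(storage_buffer) segment.
  #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
    <0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>
  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
         alignment(64) offset(%c0) flags(ReadOnly) : memref<16xf32>

  // After: verbose #hal.descriptor_set.* attributes, and the subspan points
  // at the named layout instead of carrying type(...). The shorthand's
  // per-binding ReadOnly flag disappears; read-only access stays expressed
  // on the subspan via flags(ReadOnly).
  #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
    #hal.descriptor_set.layout<0, bindings = [
      #hal.descriptor_set.binding<0, storage_buffer>,
      #hal.descriptor_set.binding<1, storage_buffer>
    ]>
  ]>
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
         alignment(64) offset(%c0) flags(ReadOnly) : memref<16xf32>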
-#pipeline_layout = #hal.pipeline.layout< - push_constants = 3, - sets = [ - <0, bindings = [ - <0, storage_buffer, ReadOnly>, - <1, storage_buffer> - ]> +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> ]> +]> hal.executable public @contract_schedule_considering_read_layout { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { hal.executable.export public @contract_schedule_considering_read_layout ordinal(0) layout(#pipeline_layout) { @@ -599,15 +603,15 @@ hal.executable public @contract_schedule_considering_read_layout { builtin.module { func.func @contract_schedule_considering_read_layout() { %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %3 = arith.index_castui %0 : i32 to index %4 = arith.index_castui %1 : i32 to index %5 = arith.index_castui %2 : i32 to index - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor> - %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> - %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor> %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0], sizes = [2, 160, 1536], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x160x1536xf16> %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [2, 1536, 1536], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x1536x1536xf16> %11 = tensor.empty() : tensor<2x160x1536xf16> @@ -639,7 +643,14 @@ hal.executable public @contract_schedule_considering_read_layout { // ----- -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer>]>]> +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> hal.executable private @attention_20x4096x64x4096x64 { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { hal.executable.export public @attention_20x4096x64x4096x64 ordinal(0) layout(#pipeline_layout) { @@ -651,10 +662,10 @@ hal.executable private @attention_20x4096x64x4096x64 { func.func @attention_20x4096x64x4096x64() { %cst = arith.constant 1.250000e-01 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = 
hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [20, 4096, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<20x4096x64xf16> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir index b6f66c4e7d287..846b30e0ef9f1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_warp_reduction.mlir @@ -1,9 +1,14 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-rocdl-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline2)))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable private @warp_reduction { hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @warp_reduction ordinal(0) layout( - #hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @warp_reduction ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -12,8 +17,8 @@ hal.executable private @warp_reduction { func.func @warp_reduction() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x512xf32> %3 = tensor.empty() : tensor<2xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2xf32>) -> tensor<2xf32> @@ -37,10 +42,16 @@ hal.executable private @warp_reduction { // ----- +#pipeline_layout = 
#hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> hal.executable public @main_dispatch_517 { hal.executable.variant public @rocm target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @warp_reduction_large_vector ordinal(0) layout( - #hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { + hal.executable.export public @warp_reduction_large_vector ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index @@ -51,9 +62,9 @@ hal.executable public @main_dispatch_517 { %c128 = arith.constant 128 : index %c0 = arith.constant 0 : index %c394240 = arith.constant 394240 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c394240) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c394240) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1280xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1280x1280xf32> %5 = tensor.empty() : tensor<1x1280xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir index d76070a41129a..353ec9e3b129f 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir @@ -2,26 +2,32 @@ // RUN: --iree-gpu-test-target=sm_60 --iree-codegen-transform-dialect-library=%p/attention_transform_spec.mlir| \ // RUN: FileCheck --check-prefix=CHECK %s -module { - func.func @_attention_dispatch_0() { - %c0 = arith.constant 0 : index - %cst = arith.constant 1.250000e-01 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<192x1024x64xf16> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<192x1024x64xf16> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 
1] : !flow.dispatch.tensor> -> tensor<192x1024x64xf16> - %7 = tensor.empty() : tensor<192x1024x64xf16> - %8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} - ins(%4, %5, %6, %cst : tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, f16) outs(%7 : tensor<192x1024x64xf16>) -> tensor<192x1024x64xf16> - flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : tensor<192x1024x64xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> +func.func @_attention_dispatch_0() { + %c0 = arith.constant 0 : index + %cst = arith.constant 1.250000e-01 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<192x1024x64xf16> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<192x1024x64xf16> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<192x1024x64xf16> + %7 = tensor.empty() : tensor<192x1024x64xf16> + %8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} + ins(%4, %5, %6, %cst : tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, tensor<192x1024x64xf16>, f16) outs(%7 : tensor<192x1024x64xf16>) -> tensor<192x1024x64xf16> + flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [192, 1024, 64], strides = [1, 1, 1] : tensor<192x1024x64xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 128)> @@ -46,16 +52,16 @@ module { // CHECK-DAG: %[[C1024:.+]] = arith.constant 1024 : index // CHECK-DAG: %[[CST_5:.+]] = arith.constant 0.000000e+00 : f32 // CHECK-DAG: %[[CST_6:.+]] = arith.constant dense<1.802980e-01> : vector<128x64xf16> -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<192x1024x64xf16, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<192x1024x64xf16, #hal.descriptor_type> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan
layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<192x1024x64xf16, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<192x1024x64xf16, #hal.descriptor_type> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<192x1024x64xf16, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<192x1024x64xf16, #hal.descriptor_type> -// CHECK: %[[D3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) +// CHECK: %[[D3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) alignment(64) // CHECK-SAME: offset(%[[C0]]) : memref<192x1024x64xf16, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D3]], 64 : memref<192x1024x64xf16, #hal.descriptor_type> // CHECK: %[[WORKGROUP_ID_X:.+]] = hal.interface.workgroup.id[0] : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir index ba61ff0ee4769..db4053624c0e7 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma.mlir @@ -2,26 +2,32 @@ // RUN: --iree-gpu-test-target=gfx908 --iree-codegen-transform-dialect-library=%p/attention_mfma_transform_spec.mlir | \ // RUN: FileCheck --check-prefix=CHECK %s -module { - func.func @attention_dispatch_0_attention_16x16384x128xf16() { - %c0 = arith.constant 0 : index - %scale = arith.constant 0.08838834764 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x16384x128xf16> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x16384x128xf16> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x16384x128xf16> - %7 = tensor.empty() : tensor<16x16384x128xf16> - %8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, - affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} - ins(%4, %5, %6, %scale : tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, f16) outs(%7 : tensor<16x16384x128xf16>) -> tensor<16x16384x128xf16> - flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : tensor<16x16384x128xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + 
#hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> +func.func @attention_dispatch_0_attention_16x16384x128xf16() { + %c0 = arith.constant 0 : index + %scale = arith.constant 0.08838834764 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x16384x128xf16> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x16384x128xf16> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x16384x128xf16> + %7 = tensor.empty() : tensor<16x16384x128xf16> + %8 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)>, + affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} + ins(%4, %5, %6, %scale : tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, tensor<16x16384x128xf16>, f16) outs(%7 : tensor<16x16384x128xf16>) -> tensor<16x16384x128xf16> + flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [16, 16384, 128], strides = [1, 1, 1] : tensor<16x16384x128xf16> -> !flow.dispatch.tensor> + return } // CHECK-NOT: vector.contract // CHECK-NOT: iree_vector_ext.to_simd diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir index 474f2ab1c808f..2782383d65c78 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir @@ -1,33 +1,38 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s --check-prefix=CDNA3 -module { - func.func @dynamic_batch_matvec() { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 - %5 = arith.index_castui %0 : i32 to index - %6 = arith.index_castui %1 : i32 to index - %7 = arith.index_castui %2 : i32 to index - %8 = arith.index_castui %3 : i32 to index - %9 = arith.index_castui %4 : i32 to index - %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor> - %11 = flow.dispatch.workload.ordinal %8, 0 : index - %12 = flow.dispatch.workload.ordinal %9, 1 : index - %13 = hal.interface.binding.subspan set(0) binding(0) 
type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor>{%11} - %14 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor>{%12} - %15 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [32, 1, %11], strides = [1, 1, 1] : !flow.dispatch.tensor>{%11} -> tensor<32x1x?xf16> - %16 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0], sizes = [32, %12, 128], strides = [1, 1, 1] : !flow.dispatch.tensor>{%12} -> tensor<32x?x128xf16> - %17 = tensor.empty() : tensor<32x1x128xf16> - %18 = linalg.fill ins(%cst : f16) outs(%17 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16> - %19 = linalg.batch_matmul ins(%15, %16 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%18 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16> - flow.dispatch.tensor.store %19, %10, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @dynamic_batch_matvec() { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 + %5 = arith.index_castui %0 : i32 to index + %6 = arith.index_castui %1 : i32 to index + %7 = arith.index_castui %2 : i32 to index + %8 = arith.index_castui %3 : i32 to index + %9 = arith.index_castui %4 : i32 to index + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%7) : !flow.dispatch.tensor> + %11 = flow.dispatch.workload.ordinal %8, 0 : index + %12 = flow.dispatch.workload.ordinal %9, 1 : index + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor>{%11} + %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor>{%12} + %15 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [32, 1, %11], strides = [1, 1, 1] : !flow.dispatch.tensor>{%11} -> tensor<32x1x?xf16> + %16 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0], sizes = [32, %12, 128], strides = [1, 1, 1] : !flow.dispatch.tensor>{%12} -> tensor<32x?x128xf16> + %17 = tensor.empty() : tensor<32x1x128xf16> + %18 = linalg.fill ins(%cst : f16) outs(%17 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16> + %19 = linalg.batch_matmul ins(%15, %16 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%18 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16> + flow.dispatch.tensor.store %19, %10, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor> + return } // CDNA3-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -40,30 +45,36 @@ module { // ----- // This test uses special heuristics that need to check the backend in the #hal.executable.target.
+ +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb"> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @vmt1() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> - %5 = tensor.empty() : tensor<1x32000xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<1x32000xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> - return - } +func.func @vmt1() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> + %5 = tensor.empty() : tensor<1x32000xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<1x32000xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -76,30 +87,36 @@ module { // ----- // This test uses special 
heuristics that need to check the backend in the #hal.executable.target. + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb"> #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @vmt2() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> - %5 = tensor.empty() : tensor<1x32000xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<1x32000xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> - return - } +func.func @vmt2() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> + %5 = tensor.empty() : tensor<1x32000xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<1x32000xf16>) -> tensor<1x32000xf16> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<1x32000xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<1x32000xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 32000], strides = [1, 1] : tensor<1x32000xf16> -> !flow.dispatch.tensor> + return } // CDNA3-DAG: #[[$CONFIG:.+]] =
#iree_codegen.lowering_config @@ -111,43 +128,50 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0)> -module { - func.func @i4_dequant_matvec() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %9 = tensor.empty() : tensor<4096xf16> - %10 = tensor.empty() : tensor<4096x32x128xf16> - %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x32x128xf16> - %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> - return - } +func.func @i4_dequant_matvec() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) 
binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %9 = tensor.empty() : tensor<4096xf16> + %10 = tensor.empty() : tensor<4096x32x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<4096xf16>) -> tensor<4096xf16> + %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32xf16>, tensor<4096x32xf16>) outs(%10 : tensor<4096x32x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x32x128xf16> + %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<32x128xf16>, tensor<4096x32x128xf16>) outs(%11 : tensor<4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %14 = arith.mulf %in, %in_0 : f16 + %15 = arith.addf %14, %out : f16 + linalg.yield %15 : f16 + } -> tensor<4096xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf16> -> !flow.dispatch.tensor> + return } // TODO: We should process multiple rows per subgroup. @@ -163,21 +187,26 @@ module { // Send 2xNxK mmt to the warp reduction pipeline. 
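The push-constant accessor migrates the same way as the binding subspans; the @dynamic_batch_matvec and @contract_schedule_considering_read_layout hunks above contain the full pattern, reduced here to a one-line before/after for reference:

  // Before: bare subscript syntax addresses the i-th push constant.
  %0 = hal.interface.constant.load[0] : i32
  // After: the op names the pipeline layout and an explicit ordinal.
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32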
-module { - func.func @skinny_mmt() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x4096xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> - %5 = tensor.empty() : tensor<2x32000xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x32000xf16>) -> tensor<2x32000xf16> - %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<2x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<2x32000xf16>) -> tensor<2x32000xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 32000], strides = [1, 1] : tensor<2x32000xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @skinny_mmt() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x4096xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> + %5 = tensor.empty() : tensor<2x32000xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x32000xf16>) -> tensor<2x32000xf16> + %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<2x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<2x32000xf16>) -> tensor<2x32000xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 32000], strides = [1, 1] : tensor<2x32000xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -191,21 +220,26 @@ module { // Send Mx2xK mmt to the warp reduction pipeline. 
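The FileCheck expectations change in step with the op syntax: because the printed layout attribute is long, the updated CHECK lines wildcard it with a regex block rather than spelling it out. The before/after pattern, as used in the attention and batch-matmul hunks above:

  // Before: the check pinned the now-removed type(...) segment.
  // CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64)
  // After: layout({{.+}}) matches whatever layout attribute is printed.
  // CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64)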
-module {
-  func.func @skinny_mmt() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>> -> tensor<2x4096xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
-    %5 = tensor.empty() : tensor<32000x2xf16>
-    %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
-    %7 = linalg.matmul_transpose_b ins(%4, %3 : tensor<32000x4096xf16>, tensor<2x4096xf16>) outs(%6 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32000, 2], strides = [1, 1] : tensor<32000x2xf16> -> !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @skinny_mmt() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4096xf16>> -> tensor<2x4096xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
+  %5 = tensor.empty() : tensor<32000x2xf16>
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
+  %7 = linalg.matmul_transpose_b ins(%4, %3 : tensor<32000x4096xf16>, tensor<2x4096xf16>) outs(%6 : tensor<32000x2xf16>) -> tensor<32000x2xf16>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32000, 2], strides = [1, 1] : tensor<32000x2xf16> -> !flow.dispatch.tensor<writeonly:tensor<32000x2xf16>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -217,29 +251,34 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-module {
-  func.func @not_vmt() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [5, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>> -> tensor<5x4096xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
-    %5 = tensor.empty() : tensor<5x32000xf16>
-    %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<5x32000xf16>) -> tensor<5x32000xf16>
-    %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<5x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<5x32000xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %8 = arith.mulf %in, %in_0 : f16
-      %9 = arith.addf %out, %8 : f16
-      linalg.yield %9 : f16
-    } -> tensor<5x32000xf16>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [5, 32000], strides = [1, 1] : tensor<5x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
-    return
-  }
+func.func @not_vmt() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [5, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<5x4096xf16>> -> tensor<5x4096xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32000x4096xf16>> -> tensor<32000x4096xf16>
+  %5 = tensor.empty() : tensor<5x32000xf16>
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<5x32000xf16>) -> tensor<5x32000xf16>
+  %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<5x4096xf16>, tensor<32000x4096xf16>) outs(%6 : tensor<5x32000xf16>) {
+  ^bb0(%in: f16, %in_0: f16, %out: f16):
+    %8 = arith.mulf %in, %in_0 : f16
+    %9 = arith.addf %out, %8 : f16
+    linalg.yield %9 : f16
+  } -> tensor<5x32000xf16>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [5, 32000], strides = [1, 1] : tensor<5x32000xf16> -> !flow.dispatch.tensor<writeonly:tensor<5x32000xf16>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_winograd.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_winograd.mlir
index 59f7e3eb414b8..8db080c0d5eaa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_winograd.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_winograd.mlir
@@ -1,16 +1,20 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s

-module {
-  func.func @winograd_filter_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 3, 64, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>> -> tensor<3x3x64x128xf32>
-    %3 = tensor.empty() : tensor<8x8x64x128xf32>
-    %4 = iree_linalg_ext.winograd.filter_transform output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1]) ins(%2 : tensor<3x3x64x128xf32>) outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 8, 64, 128], strides = [1, 1, 1, 1] : tensor<8x8x64x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_filter_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 3, 64, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>> -> tensor<3x3x64x128xf32>
+  %3 = tensor.empty() : tensor<8x8x64x128xf32>
+  %4 = iree_linalg_ext.winograd.filter_transform output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1]) ins(%2 : tensor<3x3x64x128xf32>) outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 8, 64, 128], strides = [1, 1, 1, 1] : tensor<8x8x64x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -22,17 +26,21 @@ module {

 // -----

-module {
-  func.func @winograd_input_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>> -> tensor<2x34x34x128xf16>
-    %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16>
-    %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_input_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>> -> tensor<2x34x34x128xf16>
+  %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16>
+  %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -44,17 +52,21 @@ module {

 // -----

-module {
-  func.func @winograd_output_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>> -> tensor<8x8x2x6x6x128xf16>
-    %3 = tensor.empty() : tensor<2x36x36x128xf16>
-    %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128], strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_output_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>> -> tensor<8x8x2x6x6x128xf16>
+  %3 = tensor.empty() : tensor<2x36x36x128xf16>
+  %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128], strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir
index e0a67bb4e54e4..de7bbb411ce38 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir
@@ -3,7 +3,13 @@
 // RUN: %s | FileCheck %s

 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 hal.executable private @conv2d_1x230x230x3_7x7x3x64_dispatch_0 {
   hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
     hal.executable.export public @conv2d_1x230x230x3_7x7x3x64 ordinal(0) layout(#pipeline_layout) {
@@ -15,9 +21,9 @@ hal.executable private @conv2d_1x230x230x3_7x7x3x64_dispatch_0 {
       func.func @conv2d_1x230x230x3_7x7x3x64() {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x230x230x3xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<7x7x3x64xf32>>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x64xf32>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x230x230x3xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<7x7x3x64xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x64xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 230, 230, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x230x230x3xf32>> -> tensor<1x230x230x3xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [7, 7, 3, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<7x7x3x64xf32>> -> tensor<7x7x3x64xf32>
        %5 = tensor.empty() : tensor<1x112x112x64xf32>
@@ -44,7 +50,13 @@ hal.executable private @conv2d_1x230x230x3_7x7x3x64_dispatch_0 {

 // -----

 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 hal.executable private @conv_nchw_dispatch_0 {
   hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
     hal.executable.export public @conv_nchw ordinal(0) layout(#pipeline_layout) {
@@ -56,9 +68,9 @@ hal.executable private @conv_nchw_dispatch_0 {
      func.func @conv_nchw() {
        %c0 = arith.constant 0 : index
        %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x4x66x66xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<320x4x3x3xf32>>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x320x64x64xf32>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x4x66x66xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<320x4x3x3xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x320x64x64xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 66, 66], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x66x66xf32>> -> tensor<2x4x66x66xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [320, 4, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<320x4x3x3xf32>> -> tensor<320x4x3x3xf32>
        %5 = tensor.empty() : tensor<2x320x64x64xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir
index fbc4faa1b2b64..6035b4bd73536 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_rocm.mlir
@@ -2,25 +2,19 @@
 // RUN: --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \
 // RUN: %s | FileCheck %s

-#layout = #hal.pipeline.layout<push_constants = 0, sets = [
-  <0, bindings = [
-    <0, storage_buffer, ReadOnly>,
-    <1, storage_buffer, ReadOnly>,
-    <2, storage_buffer, ReadOnly>,
-    <3, storage_buffer>
-  ]>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
   ]>
+]>
 hal.executable private @conv_nchw_dispatch_1 {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16 ordinal(0) layout(#layout) attributes {
-      hal.interface.bindings = [
-        #hal.interface.binding<0, 0>,
-        #hal.interface.binding<0, 1>,
-        #hal.interface.binding<0, 2>,
-        #hal.interface.binding<0, 3>
-      ],
-      translation_info = #iree_codegen.translation_info} {
+    hal.executable.export public @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16 ordinal(0) layout(#pipeline_layout) attributes {
+      translation_info = #iree_codegen.translation_info
+    } {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -29,10 +23,10 @@ hal.executable private @conv_nchw_dispatch_1 {
      func.func @conv_2d_nchw_fchw_2x320x64x64x320x3x3_f16() {
        %cst = arith.constant 0.000000e+00 : f16
        %c0 = arith.constant 0 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x320x130x130xf16>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<320x320x3x3xf16>>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<320xf16>>
-        %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x320x64x64xf16>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x320x130x130xf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<320x320x3x3xf16>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<320xf16>>
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x320x64x64xf16>>
        %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 320, 130, 130], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x320x130x130xf16>> -> tensor<2x320x130x130xf16>
        %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<320x320x3x3xf16>> -> tensor<320x320x3x3xf16>
        %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [320], strides = [1] : !flow.dispatch.tensor<readonly:tensor<320xf16>> -> tensor<320xf16>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir
index 78d922dae86fb..0bd40d0d034e9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_nvvm.mlir
@@ -17,9 +17,9 @@ hal.executable @abs_ex_dispatch_0 {
      func.func @abs_ex_dispatch_0() {
        %c0 = arith.constant 0 : index
        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) offset(%c128) flags(ReadOnly) : memref<16xf32, strided<[1], offset: 32>>
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<16xi32>
-        %2 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<16xf32>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) offset(%c128) flags(ReadOnly) : memref<16xf32, strided<[1], offset: 32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<16xi32>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<16xf32>
        %3 = gpu.block_id x
        %4 = gpu.block_dim x
        %5 = gpu.thread_id x
@@ -62,13 +62,13 @@ hal.executable @abs_dynamic {
        %c3 = arith.constant 3 : index
        %c5 = arith.constant 5 : index
        %c7 = arith.constant 7 : index
-        %o = hal.interface.constant.load[0] : index
-        %d0 = hal.interface.constant.load[1] : index
-        %d1 = hal.interface.constant.load[2] : index
-        %d2 = hal.interface.constant.load[3] : index
-        %0 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) offset(%o) : memref<?x?x?xf32, strided<[?, ?, 1], offset: ?>>{%d0, %d1, %d2}
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?x?xi32>{%d0, %d1, %d2}
-        %2 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<?x?x?xf32>{%d0, %d1, %d2}
+        %o = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+        %d0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+        %d1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+        %d2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) offset(%o) : memref<?x?x?xf32, strided<[?, ?, 1], offset: ?>>{%d0, %d1, %d2}
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<?x?x?xi32>{%d0, %d1, %d2}
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<?x?x?xf32>{%d0, %d1, %d2}
        %9 = memref.load %0[%c3, %c5, %c7] : memref<?x?x?xf32, strided<[?, ?, 1], offset: ?>>
        %10 = memref.load %1[%c3, %c5, %c7] : memref<?x?x?xi32>
        %11 = arith.sitofp %10 : i32 to f32
@@ -121,8 +121,8 @@ hal.executable @dead_symbol {
      func.func @dead_symbol() {
        %c0 = arith.constant 0 : index
        %c128 = arith.constant 128 : index
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16xi32>
-        %2 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<16xf32>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16xi32>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<16xf32>
        %3 = gpu.block_id x
        %4 = gpu.block_dim x
        %5 = gpu.thread_id x
@@ -159,9 +159,9 @@ hal.executable @mixed_type {
      func.func @mixed_type() {
        %c0 = arith.constant 0 : index
        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c128) : memref<16xf32, strided<[1], offset: 4>>
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) : memref<16xi32>
-        %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16xf32>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%c128) : memref<16xf32, strided<[1], offset: 4>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%c0) : memref<16xi32>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16xf32>
        %3 = gpu.block_id x
        %4 = gpu.block_dim x
        %5 = gpu.thread_id x
@@ -291,10 +291,11 @@ hal.executable @shared_memory_lowering_aligned_alloc {

 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<4, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>,
   #hal.descriptor_set.layout<1, bindings = [
-    #hal.descriptor_set.binding<2, storage_buffer>
+    #hal.descriptor_set.binding<3, storage_buffer>
   ]>
 ]>
 hal.executable @check_not_readonly {
@@ -304,13 +305,13 @@ hal.executable @check_not_readonly {
      func.func @check_not_readonly() {
        %c0 = arith.constant 0 : index
        %c128 = arith.constant 128 : index
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<16xi32>
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c128) flags(ReadOnly) : memref<16xf32, strided<[1], offset: 32>>
-        %b11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) flags(ReadOnly) : memref<16xi32>
-        %b12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c128) : memref<16xf32, strided<[1], offset: 32>>
-        %b21 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) flags(ReadOnly) : memref<16xi32>
-        %b22 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c128) flags(ReadOnly) : memref<16xf32, strided<[1], offset: 32>>
-        %2 = hal.interface.binding.subspan set(1) binding(3) type(storage_buffer) : memref<16xf32>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<16xi32>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%c128) flags(ReadOnly) : memref<16xf32, strided<[1], offset: 32>>
+        %b11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) flags(ReadOnly) : memref<16xi32>
+        %b12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) offset(%c128) : memref<16xf32, strided<[1], offset: 32>>
+        %b21 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) flags(ReadOnly) : memref<16xi32>
+        %b22 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) offset(%c128) flags(ReadOnly) : memref<16xf32, strided<[1], offset: 32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(3) : memref<16xf32>
        %3 = gpu.block_id x
        %4 = gpu.block_dim x
        %5 = gpu.thread_id x
@@ -333,8 +334,7 @@ hal.executable @check_not_readonly {

 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
-    #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<4, storage_buffer>
+    #hal.descriptor_set.binding<0, storage_buffer>
   ]>,
   #hal.descriptor_set.layout<1, bindings = [
     #hal.descriptor_set.binding<2, storage_buffer>
@@ -347,8 +347,8 @@ hal.executable @complex {
      func.func @complex() {
        %c0 = arith.constant 0 : index
        %c128 = arith.constant 128 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c128) flags(ReadOnly) : memref<16xcomplex<f32>>
-        %2 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<16xf32>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%c128) flags(ReadOnly) : memref<16xcomplex<f32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<16xf32>
        %3 = gpu.block_id x
        %4 = gpu.block_dim x
        %5 = gpu.thread_id x
@@ -412,8 +412,8 @@ hal.executable @masked_load_store {
        %c0 = arith.constant 0 : index
        %idx = gpu.thread_id x
        %pass_thru = arith.constant dense<0.000000e+00> : vector<1xf32>
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64xf32, #gpu.address_space<global>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64xf32, #gpu.address_space<global>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<64xf32, #gpu.address_space<global>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<64xf32, #gpu.address_space<global>>
        %mask = vector.create_mask %idx : vector<1xi1>
        %ld = vector.maskedload %0[%idx], %mask, %pass_thru : memref<64xf32, #gpu.address_space<global>>, vector<1xi1>, vector<1xf32> into vector<1xf32>
        vector.maskedstore %1[%idx], %mask, %ld : memref<64xf32, #gpu.address_space<global>>, vector<1xi1>, vector<1xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
index c7b7da13e2fe5..b4dd4ca0a8caa 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/convert_to_rocdl.mlir
@@ -5,7 +5,8 @@
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<4, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
   ]>,
   #hal.descriptor_set.layout<1, bindings = [
     #hal.descriptor_set.binding<2, storage_buffer>
@@ -17,9 +18,9 @@ hal.executable @abs_ex_dispatch_0 {
  builtin.module {
    func.func @abs_ex_dispatch_0() {
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) flags(ReadOnly) : memref<16xf32>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16xf32>
-      %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<16xf32>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) flags(ReadOnly) : memref<16xf32>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16xf32>
+      %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<16xf32>
      %3 = gpu.block_id x
      %4 = gpu.block_dim x
      %5 = gpu.thread_id x
@@ -50,7 +51,7 @@ hal.executable @abs_ex_dispatch_0 {
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
-    #hal.descriptor_set.binding<4, storage_buffer>
+    #hal.descriptor_set.binding<1, storage_buffer>
   ]>,
   #hal.descriptor_set.layout<1, bindings = [
     #hal.descriptor_set.binding<2, storage_buffer>
@@ -62,9 +63,9 @@ hal.executable @abs_ex_dispatch_0 {
  builtin.module {
    func.func @reduction_maximum() {
      %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) :
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) :
          memref<32x64x64xf32, strided<[4096, 64, 1], offset: ?>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x64xf32,
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x64x64xf32,
          strided<[4096, 64, 1], offset: ?>>
      %2 = vector.load %0[%c0, %c0, %c0] : memref<32x64x64xf32, strided<[4096, 64, 1], offset: ?>>, vector<2xf32>
      %3 = vector.reduction <maximum>, %2 : vector<2xf32> into f32
@@ -114,8 +115,8 @@ hal.executable @masked_load_store {
      %c0 = arith.constant 0 : index
      %idx = gpu.thread_id x
      %pass_thru = arith.constant dense<0.000000e+00> : vector<1xf32>
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<64xf32, #gpu.address_space<global>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64xf32, #gpu.address_space<global>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<64xf32, #gpu.address_space<global>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<64xf32, #gpu.address_space<global>>
      %mask = vector.create_mask %idx : vector<1xi1>
      %ld = vector.maskedload %0[%idx], %mask, %pass_thru : memref<64xf32, #gpu.address_space<global>>, vector<1xi1>, vector<1xf32> into vector<1xf32>
      vector.maskedstore %1[%idx], %mask, %ld : memref<64xf32, #gpu.address_space<global>>, vector<1xi1>, vector<1xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
index ad5478c11c948..c41952608ddc3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
@@ -1,39 +1,44 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline="builtin.module(func.func(iree-llvmgpu-tile-and-distribute))" %s | FileCheck %s

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 256)>
 #map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
 #translation = #iree_codegen.translation_info
-module {
-  func.func @dot_dispatch_0() attributes {translation_info = #translation} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %c1024 = arith.constant 1024 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x1024xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1024x1024xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x1024xf32>
-    %workgroup_size_x = hal.interface.workgroup.size[0] : index
-    %workgroup_size_y = hal.interface.workgroup.size[1] : index
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %3 = affine.apply #map()[%workgroup_id_y]
-    %4 = affine.apply #map()[%workgroup_count_y]
-    scf.for %arg0 = %3 to %c1024 step %4 {
-      %5 = affine.apply #map1()[%workgroup_id_x]
-      %6 = affine.apply #map1()[%workgroup_count_x]
-      scf.for %arg1 = %5 to %c1024 step %6 {
-        %subview = memref.subview %0[%arg0, 0] [2, 1024] [1, 1] : memref<1024x1024xf32> to memref<2x1024xf32, #map2>
-        %subview_0 = memref.subview %1[0, %arg1] [1024, 256] [1, 1] : memref<1024x1024xf32> to memref<1024x256xf32, #map2>
-        %subview_1 = memref.subview %2[%arg0, %arg1] [2, 256] [1, 1] : memref<1024x1024xf32> to memref<2x256xf32, #map2>
-        linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<2x256xf32, #map2>)
-        linalg.matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<2x1024xf32, #map2>, memref<1024x256xf32, #map2>) outs(%subview_1 : memref<2x256xf32, #map2>)
-      }
+func.func @dot_dispatch_0() attributes {translation_info = #translation} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1024 = arith.constant 1024 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1024x1024xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<1024x1024xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1024x1024xf32>
+  %workgroup_size_x = hal.interface.workgroup.size[0] : index
+  %workgroup_size_y = hal.interface.workgroup.size[1] : index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %3 = affine.apply #map()[%workgroup_id_y]
+  %4 = affine.apply #map()[%workgroup_count_y]
+  scf.for %arg0 = %3 to %c1024 step %4 {
+    %5 = affine.apply #map1()[%workgroup_id_x]
+    %6 = affine.apply #map1()[%workgroup_count_x]
+    scf.for %arg1 = %5 to %c1024 step %6 {
+      %subview = memref.subview %0[%arg0, 0] [2, 1024] [1, 1] : memref<1024x1024xf32> to memref<2x1024xf32, #map2>
+      %subview_0 = memref.subview %1[0, %arg1] [1024, 256] [1, 1] : memref<1024x1024xf32> to memref<1024x256xf32, #map2>
+      %subview_1 = memref.subview %2[%arg0, %arg1] [2, 256] [1, 1] : memref<1024x1024xf32> to memref<2x256xf32, #map2>
+      linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<2x256xf32, #map2>)
+      linalg.matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<2x1024xf32, #map2>, memref<1024x256xf32, #map2>) outs(%subview_1 : memref<2x256xf32, #map2>)
     }
-    return
   }
+  return
 }

 // CHECK: func.func @dot_dispatch_0()
@@ -65,6 +70,13 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 8)>
 #map1 = affine_map<()[s0] -> (s0 * 32)>
 #map2 = affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>
 #map3 = affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>
 #map4 = affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>
 #translation = #iree_codegen.translation_info
-module {
-  func.func @batch_matmul_func() attributes {translation_info = #translation} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %c4 = arith.constant 4 : index
-    %c32 = arith.constant 32 : index
-    %c64 = arith.constant 64 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : memref<4x32x1024xf32>
-    memref.assume_alignment %0, 32 : memref<4x32x1024xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : memref<4x1024x64xf32>
-    memref.assume_alignment %1, 32 : memref<4x1024x64xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : memref<4x32x64xf32>
-    memref.assume_alignment %2, 32 : memref<4x32x64xf32>
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %workgroup_id_z = hal.interface.workgroup.id[2] : index
-    %workgroup_count_z = hal.interface.workgroup.count[2] : index
-    scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z {
-      %3 = affine.apply #map()[%workgroup_id_y]
-      %4 = affine.apply #map()[%workgroup_count_y]
-      scf.for %arg1 = %3 to %c32 step %4 {
-        %5 = affine.apply #map1()[%workgroup_id_x]
-        %6 = affine.apply #map1()[%workgroup_count_x]
-        scf.for %arg2 = %5 to %c64 step %6 {
-          %subview = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, #map2>
-          %subview_0 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, #map3>
-          %subview_1 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, #map4>
-          linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<1x8x32xf32, #map4>)
-          linalg.batch_matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<1x8x1024xf32, #map2>, memref<1x1024x32xf32, #map3>) outs(%subview_1 : memref<1x8x32xf32, #map4>)
-        }
+func.func @batch_matmul_func() attributes {translation_info = #translation} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c4 = arith.constant 4 : index
+  %c32 = arith.constant 32 : index
+  %c64 = arith.constant 64 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : memref<4x32x1024xf32>
+  memref.assume_alignment %0, 32 : memref<4x32x1024xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : memref<4x1024x64xf32>
+  memref.assume_alignment %1, 32 : memref<4x1024x64xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : memref<4x32x64xf32>
+  memref.assume_alignment %2, 32 : memref<4x32x64xf32>
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %workgroup_id_z = hal.interface.workgroup.id[2] : index
+  %workgroup_count_z = hal.interface.workgroup.count[2] : index
+  scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z {
+    %3 = affine.apply #map()[%workgroup_id_y]
+    %4 = affine.apply #map()[%workgroup_count_y]
+    scf.for %arg1 = %3 to %c32 step %4 {
+      %5 = affine.apply #map1()[%workgroup_id_x]
+      %6 = affine.apply #map1()[%workgroup_count_x]
+      scf.for %arg2 = %5 to %c64 step %6 {
+        %subview = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, #map2>
+        %subview_0 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, #map3>
+        %subview_1 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, #map4>
+        linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<1x8x32xf32, #map4>)
+        linalg.batch_matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<1x8x1024xf32, #map2>, memref<1x1024x32xf32, #map3>) outs(%subview_1 : memref<1x8x32xf32, #map4>)
       }
     }
-    return
   }
+  return
 }

 // CHECK: #[[$MAP:.*]] = affine_map<()[s0] -> (s0 * 4)>
@@ -133,40 +143,45 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 32)>
 #map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
 #translation = #iree_codegen.translation_info
-module {
-  func.func @dot_dispatch_0() attributes {translation_info = #translation} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %c1024 = arith.constant 1024 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x1024xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1024x1024xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x1024xf32>
-    %workgroup_size_x = hal.interface.workgroup.size[0] : index
-    %workgroup_size_y = hal.interface.workgroup.size[1] : index
-    %workgroup_id_x = hal.interface.workgroup.id[0] : index
-    %workgroup_count_x = hal.interface.workgroup.count[0] : index
-    %workgroup_id_y = hal.interface.workgroup.id[1] : index
-    %workgroup_count_y = hal.interface.workgroup.count[1] : index
-    %3 = affine.apply #map()[%workgroup_id_y]
-    %4 = affine.apply #map()[%workgroup_count_y]
-    scf.for %arg0 = %3 to %c1024 step %4 {
-      %5 = affine.apply #map1()[%workgroup_id_x]
-      %6 = affine.apply #map1()[%workgroup_count_x]
-      scf.for %arg1 = %5 to %c1024 step %6 {
-        %subview = memref.subview %0[%arg0, 0] [2, 1024] [1, 1] : memref<1024x1024xf32> to memref<2x1024xf32, #map2>
-        %subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32> to memref<1024x32xf32, #map2>
-        %subview_1 = memref.subview %2[%arg0, %arg1] [2, 32] [1, 1] : memref<1024x1024xf32> to memref<2x32xf32, #map2>
-        linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<2x32xf32, #map2>)
-        linalg.matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<2x1024xf32, #map2>, memref<1024x32xf32, #map2>) outs(%subview_1 : memref<2x32xf32, #map2>)
-      }
+func.func @dot_dispatch_0() attributes {translation_info = #translation} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1024 = arith.constant 1024 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1024x1024xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<1024x1024xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1024x1024xf32>
+  %workgroup_size_x = hal.interface.workgroup.size[0] : index
+  %workgroup_size_y = hal.interface.workgroup.size[1] : index
+  %workgroup_id_x = hal.interface.workgroup.id[0] : index
+  %workgroup_count_x = hal.interface.workgroup.count[0] : index
+  %workgroup_id_y = hal.interface.workgroup.id[1] : index
+  %workgroup_count_y = hal.interface.workgroup.count[1] : index
+  %3 = affine.apply #map()[%workgroup_id_y]
+  %4 = affine.apply #map()[%workgroup_count_y]
+  scf.for %arg0 = %3 to %c1024 step %4 {
+    %5 = affine.apply #map1()[%workgroup_id_x]
+    %6 = affine.apply #map1()[%workgroup_count_x]
+    scf.for %arg1 = %5 to %c1024 step %6 {
+      %subview = memref.subview %0[%arg0, 0] [2, 1024] [1, 1] : memref<1024x1024xf32> to memref<2x1024xf32, #map2>
+      %subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32> to memref<1024x32xf32, #map2>
+      %subview_1 = memref.subview %2[%arg0, %arg1] [2, 32] [1, 1] : memref<1024x1024xf32> to memref<2x32xf32, #map2>
+      linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<2x32xf32, #map2>)
+      linalg.matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<2x1024xf32, #map2>, memref<1024x32xf32, #map2>) outs(%subview_1 : memref<2x32xf32, #map2>)
    }
-    return
  }
+  return
 }

 // CHECK: func.func @dot_dispatch_0()
@@ -200,28 +215,32 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
 #translation = #iree_codegen.translation_info
-module {
-  func.func @predict_dispatch_153() attributes {translation_info = #translation} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0x7FC00000 : f32
-    %cst_0 = arith.constant 0xFF800000 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<f32>
-    linalg.fill {lowering_config = #config} ins(%cst_0 : f32) outs(%1 : memref<f32>)
-    linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) attrs = {lowering_config = #config} {
-    ^bb0(%in: f32, %out: f32):
-      %2 = arith.cmpf ogt, %in, %out : f32
-      %3 = arith.select %2, %in, %out : f32
-      %4 = arith.cmpf uno, %in, %out : f32
-      %5 = arith.select %4, %cst, %3 : f32
-      linalg.yield %5 : f32
-    }
-    return
+func.func @predict_dispatch_153() attributes {translation_info = #translation} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0x7FC00000 : f32
+  %cst_0 = arith.constant 0xFF800000 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1000xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<f32>
+  linalg.fill {lowering_config = #config} ins(%cst_0 : f32) outs(%1 : memref<f32>)
+  linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref<f32>) attrs = {lowering_config = #config} {
+  ^bb0(%in: f32, %out: f32):
+    %2 = arith.cmpf ogt, %in, %out : f32
+    %3 = arith.select %2, %in, %out : f32
+    %4 = arith.cmpf uno, %in, %out : f32
+    %5 = arith.select %4, %cst, %3 : f32
+    linalg.yield %5 : f32
   }
+  return
 }

 // CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -234,6 +253,13 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 256)>
 #map1 = affine_map<(d0) -> (256, -d0 + 56)>
@@ -248,11 +274,11 @@ module {
   %c802816 = arith.constant 802816 : index
   %c41664 = arith.constant 41664 : index
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x64x56x56xf32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<1x64x56x56xf32>
   memref.assume_alignment %0, 64 : memref<1x64x56x56xf32>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c41664) : memref<64x64x1x1xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c41664) : memref<64x64x1x1xf32>
   memref.assume_alignment %1, 64 : memref<64x64x1x1xf32>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c802816) : memref<1x64x56x56xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c802816) : memref<1x64x56x56xf32>
   memref.assume_alignment %2, 64 : memref<1x64x56x56xf32>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_count_x = hal.interface.workgroup.count[0] : index
@@ -290,6 +316,13 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
 #map = affine_map<()[s0] -> (s0 * 2)>
@@ -307,11 +340,11 @@ module {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 8.000000e+00 : f32
   %cst_0 = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.constant.load[0] : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
   %1 = arith.index_cast %0 : i32 to index
-  %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%1) : memref<?x?xf32>{%1, %1}
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%1) : memref<?x?xf32>{%1, %1}
-  %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<?x?x?xf32>{%1, %1, %1}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%1) : memref<?x?xf32>{%1, %1}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%1) : memref<?x?xf32>{%1, %1}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<?x?x?xf32>{%1, %1, %1}
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_count_x = hal.interface.workgroup.count[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir
index ac31f0daf3555..30e5278e09ac2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/elementwise_pipeline.mlir
@@ -1,23 +1,27 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2, d3) -> (d2, d1, d0, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @forward_dispatch_0_generic_320x320x3x3() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 320, 320, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>> -> tensor<3x320x320x3xf32>
-    %3 = tensor.empty() : tensor<320x320x3x3xf32>
-    %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<3x320x320x3xf32>) outs(%3 : tensor<320x320x3x3xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %5 = arith.addf %in, %cst : f32
-      linalg.yield %5 : f32
-    } -> tensor<320x320x3x3xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : tensor<320x320x3x3xf32> -> !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
-    return
-  }
+func.func @forward_dispatch_0_generic_320x320x3x3() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 320, 320, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x320x320x3xf32>> -> tensor<3x320x320x3xf32>
+  %3 = tensor.empty() : tensor<320x320x3x3xf32>
+  %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<3x320x320x3xf32>) outs(%3 : tensor<320x320x3x3xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %5 = arith.addf %in, %cst : f32
+    linalg.yield %5 : f32
+  } -> tensor<320x320x3x3xf32>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [320, 320, 3, 3], strides = [1, 1, 1, 1] : tensor<320x320x3x3xf32> -> !flow.dispatch.tensor<writeonly:tensor<320x320x3x3xf32>>
+  return
 }
 // CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
 // CHECK: func.func @forward_dispatch_0_generic_320x320x3x3()
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
index 1ea52b6638073..74b075c71e1d5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_generalize_named_ops.mlir
@@ -10,21 +10,26 @@
 // CHECK-NEXT: linalg.generic
 // CHECK-NOT:  linalg.matmul_transpose_b

-module {
-  func.func @warp_reduction_large_vector() {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c128 = arith.constant 128 : index
-    %c0 = arith.constant 0 : index
-    %c394240 = arith.constant 394240 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c394240) : !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>> -> tensor<1x1280xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>> -> tensor<1280x1280xf32>
-    %5 = tensor.empty() : tensor<1x1280xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
-    %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<1x1280xf32>, tensor<1280x1280xf32>) outs(%6 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : tensor<1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @warp_reduction_large_vector() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %c394240 = arith.constant 394240 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c128) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c394240) : !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1280xf32>> -> tensor<1x1280xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf32>> -> tensor<1280x1280xf32>
+  %5 = tensor.empty() : tensor<1x1280xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
+  %7 = linalg.matmul_transpose_b ins(%3, %4 : tensor<1x1280xf32>, tensor<1280x1280xf32>) outs(%6 : tensor<1x1280xf32>) -> tensor<1x1280xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1, 1280], strides = [1, 1] : tensor<1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1280xf32>>
+  return
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 69c845359bcc1..3c3932ccf98f1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -5,24 +5,29 @@

 // Transform dialect attributes are tested separately.

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0) -> (d0)>
-module {
-  func.func @add_dispatch_0() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<16384xf32>>
-    %3 = tensor.empty() : tensor<16384xf32>
-    %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [16384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
-    %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [16384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
-    %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%4, %5 : tensor<16384xf32>, tensor<16384xf32>) outs(%3 : tensor<16384xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %7 = arith.addf %in, %in_0 : f32
-      linalg.yield %7 : f32
-    } -> tensor<16384xf32>
-    flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [16384], strides = [1] : tensor<16384xf32> -> !flow.dispatch.tensor<writeonly:tensor<16384xf32>>
-    return
-  }
+func.func @add_dispatch_0() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<16384xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<16384xf32>>
+  %3 = tensor.empty() : tensor<16384xf32>
+  %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [16384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
+  %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [16384], strides = [1] : !flow.dispatch.tensor<readonly:tensor<16384xf32>> -> tensor<16384xf32>
+  %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%4, %5 : tensor<16384xf32>, tensor<16384xf32>) outs(%3 : tensor<16384xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32): + %7 = arith.addf %in, %in_0 : f32 + linalg.yield %7 : f32 + } -> tensor<16384xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [16384], strides = [1] : tensor<16384xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -34,19 +39,24 @@ module { // ----- -module { - func.func @dot_dispatch_1() { - %c0 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %c2 = arith.constant 2 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2x3xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x4xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2x4xf32> - linalg.fill ins(%cst : f32) outs(%2 : memref<2x4xf32>) - linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>) - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @dot_dispatch_1() { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2x3xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<3x4xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<2x4xf32> + linalg.fill ins(%cst : f32) outs(%2 : memref<2x4xf32>) + linalg.matmul ins(%0, %1 : memref<2x3xf32>, memref<3x4xf32>) outs(%2 : memref<2x4xf32>) + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -60,19 +70,24 @@ module { // ----- -module { - func.func @unaligned_k() { - %c0 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %c2 = arith.constant 2 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<128x258xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<258x64xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<128x64xf32> - linalg.fill ins(%cst : f32) outs(%2 : memref<128x64xf32>) - linalg.matmul ins(%0, %1 : memref<128x258xf32>, memref<258x64xf32>) outs(%2 : memref<128x64xf32>) - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @unaligned_k() { + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<128x258xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<258x64xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<128x64xf32> + linalg.fill ins(%cst : f32) outs(%2 : memref<128x64xf32>) + linalg.matmul ins(%0, %1 : memref<128x258xf32>, memref<258x64xf32>) outs(%2 : memref<128x64xf32>) + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -86,26 +101,30 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0) -> (d0)> #map1 = 
affine_map<(d0) -> ()> -module { - func.func @predict_dispatch_153() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0x7FC00000 : f32 - %cst_0 = arith.constant 0xFF800000 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1000xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref - linalg.fill ins(%cst_0 : f32) outs(%1 : memref) - linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref) { - ^bb0(%in: f32, %out: f32): - %2 = arith.cmpf ogt, %in, %out : f32 - %3 = arith.select %2, %in, %out : f32 - %4 = arith.cmpf uno, %in, %out : f32 - %5 = arith.select %4, %cst, %3 : f32 - linalg.yield %5 : f32 - } - return +func.func @predict_dispatch_153() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0x7FC00000 : f32 + %cst_0 = arith.constant 0xFF800000 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1000xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref + linalg.fill ins(%cst_0 : f32) outs(%1 : memref) + linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["reduction"]} ins(%0 : memref<1000xf32>) outs(%1 : memref) { + ^bb0(%in: f32, %out: f32): + %2 = arith.cmpf ogt, %in, %out : f32 + %3 = arith.select %2, %in, %out : f32 + %4 = arith.cmpf uno, %in, %out : f32 + %5 = arith.select %4, %cst, %3 : f32 + linalg.yield %5 : f32 } + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -119,25 +138,29 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d2, d0, d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @reduction_aligned2() { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 128, 384], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x128x384xf32> - %3 = tensor.empty() : tensor<128x384xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<128x384xf32>) -> tensor<128x384xf32> - %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<4x128x384xf32>) outs(%4 : tensor<128x384xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<128x384xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor> - return - } +func.func @reduction_aligned2() { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 128, 384], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x128x384xf32> + %3 = tensor.empty() : tensor<128x384xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<128x384xf32>) -> 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d2, d0, d1)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
-module {
-  func.func @reduction_aligned2() {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 128, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>> -> tensor<4x128x384xf32>
-    %3 = tensor.empty() : tensor<128x384xf32>
-    %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<128x384xf32>) -> tensor<128x384xf32>
-    %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<4x128x384xf32>) outs(%4 : tensor<128x384xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %6 = arith.addf %in, %out : f32
-      linalg.yield %6 : f32
-    } -> tensor<128x384xf32>
-    flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
-    return
-  }
+func.func @reduction_aligned2() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 128, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x128x384xf32>> -> tensor<4x128x384xf32>
+  %3 = tensor.empty() : tensor<128x384xf32>
+  %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<128x384xf32>) -> tensor<128x384xf32>
+  %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<4x128x384xf32>) outs(%4 : tensor<128x384xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = arith.addf %in, %out : f32
+    linalg.yield %6 : f32
+  } -> tensor<128x384xf32>
+  flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : tensor<128x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x384xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -151,20 +174,24 @@ module {
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @copy_as_generic() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%0, %1}
-    linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<?x?xi32>) outs(%3 : memref<?x?xi32>) {
-    ^bb0(%in: i32, %out: i32):
-      linalg.yield %in : i32
-    }
-    return
+func.func @copy_as_generic() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<?x?xi32>{%0, %1}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<?x?xi32>{%0, %1}
+  linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<?x?xi32>) outs(%3 : memref<?x?xi32>) {
+  ^bb0(%in: i32, %out: i32):
+    linalg.yield %in : i32
 }
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -176,21 +203,25 @@ module {
 // -----
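// With push constants declared in the layout, scalar parameter loads switch
// from the old bracket syntax to explicit ordinals, and the dynamic memref
// sizes {%0, %1} are tied to those loaded values. A sketch (illustrative,
// not a line taken from this patch):
//   old: %0 = hal.interface.constant.load[0] : index
//   new: %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
-module {
-  func.func @static_1d_fft_stage2() {
-    %c0 = arith.constant 0 : index
-    %c2 = arith.constant 2 : index
-    %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
-    %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
-    %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
-    %4:2 = iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
-    flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @static_1d_fft_stage2() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
+  %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
+  %0 =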
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor> -> tensor<32xf32> + %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor> -> tensor<32xf32> + %4:2 = iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32> + flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor> + flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -202,22 +233,26 @@ module { // ----- -module { - func.func @static_3d_fft_stage3() { - %c0 = arith.constant 0 : index - %c3 = arith.constant 3 : index - %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index - %c32 = arith.constant 32 : index - %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32> - %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32> - %0 = bufferization.to_memref %cst_0 : memref<4xf32> - %1 = bufferization.to_memref %cst : memref<4xf32> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32> - %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32> - iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>) - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @static_3d_fft_stage3() { + %c0 = arith.constant 0 : index + %c3 = arith.constant 3 : index + %c64 = arith.constant 64 : index + %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index + %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32> + %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32> + %0 = bufferization.to_memref %cst_0 : memref<4xf32> + %1 = bufferization.to_memref %cst : memref<4xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x128x32xf32> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<64x128x32xf32> + iree_linalg_ext.fft {__internal_linalg_transform__ = "workgroup"} ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>) + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -229,26 +264,31 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @_lowering_config_test_dispatch_1() { - %cst = arith.constant 0.000000e+00 : f32 - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index 
- %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> - %5 = tensor.empty() : tensor<128x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation} ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> - return - } +func.func @_lowering_config_test_dispatch_1() { + %cst = arith.constant 0.000000e+00 : f32 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> + %5 = tensor.empty() : tensor<128x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup", compilation_info = #compilation} ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c2304000) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x576000xf32> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x576000xi32> - %6:2 = iree_linalg_ext.sort dimension(1) outs(%4, %5 : tensor<1x576000xf32>, tensor<1x576000xi32>) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: i32): - %7 = arith.cmpf ogt, %arg0, %arg1 : f32 - iree_linalg_ext.yield %7 : i1 - } -> tensor<1x576000xf32>, tensor<1x576000xi32> - flow.dispatch.tensor.store %6#0, %2, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xf32> -> 
!flow.dispatch.tensor> - flow.dispatch.tensor.store %6#1, %3, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xi32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> +func.func @sort_op() { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c2304000 = arith.constant 2304000 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(32) offset(%c2304000) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x576000xf32> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x576000xi32> + %6:2 = iree_linalg_ext.sort dimension(1) outs(%4, %5 : tensor<1x576000xf32>, tensor<1x576000xi32>) { + ^bb0(%arg0: f32, %arg1: f32, %arg2: i32, %arg3: i32): + %7 = arith.cmpf ogt, %arg0, %arg1 : f32 + iree_linalg_ext.yield %7 : i1 + } -> tensor<1x576000xf32>, tensor<1x576000xi32> + flow.dispatch.tensor.store %6#0, %2, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xf32> -> !flow.dispatch.tensor> + flow.dispatch.tensor.store %6#1, %3, offsets = [0, 0], sizes = [1, 576000], strides = [1, 1] : tensor<1x576000xi32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -293,23 +339,28 @@ module { // ----- -module { - func.func @matmul_config_sm35() { - %cst = arith.constant 0.000000e+00 : f32 - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> - %5 = tensor.empty() : tensor<128x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_config_sm35() { + %cst = arith.constant 0.000000e+00 : f32 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %0 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> + %5 = tensor.empty() : tensor<128x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info @@ -318,23 +369,28 @@ module { // ----- -module { - func.func @matmul_config_sm80() { - %cst = arith.constant 0.000000e+00 : f32 - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> - %5 = tensor.empty() : tensor<128x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_config_sm80() { + %cst = arith.constant 0.000000e+00 : f32 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> + %5 = tensor.empty() : tensor<128x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + 
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> + return } // SM80-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> - %5 = tensor.empty() : tensor<128x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_config_sm86() { + %cst = arith.constant 0.000000e+00 : f32 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> + %5 = tensor.empty() : tensor<128x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> + return } // SM80-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> ()> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @contract_reduction() { - %c0 = arith.constant 0 : index - %c40064 = arith.constant 40064 : index - %c34752 = arith.constant 34752 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c40064) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c34752) : !flow.dispatch.tensor> - %3 = tensor.empty() : tensor<3x64xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 4], sizes = [3, 64, 4, 1], strides 
= [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x64x4xf32> - %5 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%3 : tensor<3x64xf32>) -> tensor<3x64xf32> - %6 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4, %6 : tensor<3x64x4xf32>, tensor) outs(%5 : tensor<3x64xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.subf %in, %in_0 : f32 - %9 = arith.maximumf %8, %cst : f32 - %10 = arith.mulf %9, %9 : f32 - %11 = arith.addf %out, %10 : f32 - linalg.yield %11 : f32 - } -> tensor<3x64xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3, 64], strides = [1, 1] : tensor<3x64xf32> -> !flow.dispatch.tensor> - return - } - +func.func @contract_reduction() { + %c0 = arith.constant 0 : index + %c40064 = arith.constant 40064 : index + %c34752 = arith.constant 34752 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c40064) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c34752) : !flow.dispatch.tensor> + %3 = tensor.empty() : tensor<3x64xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 4], sizes = [3, 64, 4, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x64x4xf32> + %5 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%3 : tensor<3x64xf32>) -> tensor<3x64xf32> + %6 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%4, %6 : tensor<3x64x4xf32>, tensor) outs(%5 : tensor<3x64xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %8 = arith.subf %in, %in_0 : f32 + %9 = arith.maximumf %8, %cst : f32 + %10 = arith.mulf %9, %9 : f32 + %11 = arith.addf %out, %10 : f32 + linalg.yield %11 : f32 + } -> tensor<3x64xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3, 64], strides = [1, 1] : tensor<3x64xf32> -> !flow.dispatch.tensor> + return } // SM80-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info>{%4, %5} - %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7} - %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor>{%4, %5} -> tensor - %11 = tensor.empty(%6, %7) : tensor - %pack = tensor.pack %10 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %11 : tensor -> tensor - flow.dispatch.tensor.store %pack, %9, offsets = [0, 0, 0, 0], sizes = [%6, %7, 2, 2], strides = [1, 1, 1, 1] : tensor -> !flow.dispatch.tensor>{%6, %7} - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @dynamic_pack_2x2() { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = 
hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = arith.index_castui %0 : i32 to index + %5 = arith.index_castui %1 : i32 to index + %6 = arith.index_castui %2 : i32 to index + %7 = arith.index_castui %3 : i32 to index + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor>{%4, %5} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7} + %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [%4, %5], strides = [1, 1] : !flow.dispatch.tensor>{%4, %5} -> tensor + %11 = tensor.empty(%6, %7) : tensor + %pack = tensor.pack %10 inner_dims_pos = [0, 1] inner_tiles = [2, 2] into %11 : tensor -> tensor + flow.dispatch.tensor.store %pack, %9, offsets = [0, 0, 0, 0], sizes = [%6, %7, 2, 2], strides = [1, 1, 1, 1] : tensor -> !flow.dispatch.tensor>{%6, %7} + return } // SM80-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -436,23 +505,28 @@ module { // ----- -module { - func.func @large_matmul_f16() { - %cst = arith.constant 0.000000e+00 : f16 - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2560x1792xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1792x2048xf16> - %5 = tensor.empty() : tensor<2560x2048xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16> - %7 = linalg.matmul ins(%3, %4 : tensor<2560x1792xf16>, tensor<1792x2048xf16>) outs(%6 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @large_matmul_f16() { + %cst = arith.constant 0.000000e+00 : f16 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2560x1792xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1792x2048xf16> + %5 = tensor.empty() : tensor<2560x2048xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16> + %7 = linalg.matmul ins(%3, %4 : tensor<2560x1792xf16>, tensor<1792x2048xf16>) outs(%6 : tensor<2560x2048xf16>) -> tensor<2560x2048xf16> + flow.dispatch.tensor.store %7, %2, 
offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf16> -> !flow.dispatch.tensor> + return } // SM80-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -465,23 +539,28 @@ module { // ----- -module { - func.func @large_matmul_f32() { - %cst = arith.constant 0.000000e+00 : f32 - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2560x1792xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1792x2048xf32> - %5 = tensor.empty() : tensor<2560x2048xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2560x1792xf32>, tensor<1792x2048xf32>) outs(%6 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @large_matmul_f32() { + %cst = arith.constant 0.000000e+00 : f32 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 1792], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2560x1792xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1792, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1792x2048xf32> + %5 = tensor.empty() : tensor<2560x2048xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2560x1792xf32>, tensor<1792x2048xf32>) outs(%6 : tensor<2560x2048xf32>) -> tensor<2560x2048xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2560, 2048], strides = [1, 1] : tensor<2560x2048xf32> -> !flow.dispatch.tensor> + return } // SM80-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @inner_unit_dim() { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = tensor.empty() : tensor<16384x1xf32> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : 
!flow.dispatch.tensor> -> tensor<16384x1xf32> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<16384x1xf32> - %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %5 : tensor<16384x1xf32>, tensor<16384x1xf32>) outs(%3 : tensor<16384x1xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %7 = arith.addf %in, %in_0 : f32 - linalg.yield %7 : f32 - } -> tensor<16384x1xf32> - flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : tensor<16384x1xf32> -> !flow.dispatch.tensor> - return - } +func.func @inner_unit_dim() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = tensor.empty() : tensor<16384x1xf32> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<16384x1xf32> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<16384x1xf32> + %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %5 : tensor<16384x1xf32>, tensor<16384x1xf32>) outs(%3 : tensor<16384x1xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %7 = arith.addf %in, %in_0 : f32 + linalg.yield %7 : f32 + } -> tensor<16384x1xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [16384, 1], strides = [1, 1] : tensor<16384x1xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -524,42 +608,47 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d3)> -module { - func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32() { - %c0 = arith.constant 0 : index - %c162508800 = arith.constant 162508800 : index - %cst = arith.constant 1.001000e-05 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant dense_resource<__elided__> : tensor<64xf32> - %cst_2 = arith.constant dense_resource<__elided__> : tensor<64xf32> - %cst_3 = arith.constant dense_resource<__elided__> : tensor<64xf32> - %cst_4 = arith.constant dense_resource<__elided__> : tensor<64xf32> - %cst_5 = arith.constant dense_resource<__elided__> : tensor<64xf32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c162508800) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [256, 230, 230, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<256x230x230x3xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [7, 7, 3, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<7x7x3x64xf32> - %5 = tensor.empty() : 
tensor<256x112x112x64xf32> - %6 = linalg.fill ins(%cst_0 : f32) outs(%5 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<256x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%6 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32> - %8 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map1, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %cst_1, %cst_2, %cst_3, %cst_4, %cst_5 : tensor<256x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%5 : tensor<256x112x112x64xf32>) { - ^bb0(%in: f32, %in_6: f32, %in_7: f32, %in_8: f32, %in_9: f32, %in_10: f32, %out: f32): - %9 = arith.addf %in_9, %cst : f32 - %10 = math.sqrt %9 : f32 - %11 = arith.addf %in, %in_6 : f32 - %12 = arith.subf %11, %in_7 : f32 - %13 = arith.mulf %12, %in_8 : f32 - %14 = arith.divf %13, %10 : f32 - %15 = arith.addf %14, %in_10 : f32 - %16 = arith.maximumf %15, %cst_0 : f32 - linalg.yield %16 : f32 - } -> tensor<256x112x112x64xf32> - flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [256, 112, 112, 64], strides = [1, 1, 1, 1] : tensor<256x112x112x64xf32> -> !flow.dispatch.tensor> - return - } +func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32() { + %c0 = arith.constant 0 : index + %c162508800 = arith.constant 162508800 : index + %cst = arith.constant 1.001000e-05 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant dense_resource<__elided__> : tensor<64xf32> + %cst_2 = arith.constant dense_resource<__elided__> : tensor<64xf32> + %cst_3 = arith.constant dense_resource<__elided__> : tensor<64xf32> + %cst_4 = arith.constant dense_resource<__elided__> : tensor<64xf32> + %cst_5 = arith.constant dense_resource<__elided__> : tensor<64xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c162508800) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [256, 230, 230, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<256x230x230x3xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [7, 7, 3, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<7x7x3x64xf32> + %5 = tensor.empty() : tensor<256x112x112x64xf32> + %6 = linalg.fill ins(%cst_0 : f32) outs(%5 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32> + %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<256x230x230x3xf32>, tensor<7x7x3x64xf32>) outs(%6 : tensor<256x112x112x64xf32>) -> tensor<256x112x112x64xf32> + %8 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map1, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %cst_1, %cst_2, %cst_3, %cst_4, %cst_5 : tensor<256x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) outs(%5 : tensor<256x112x112x64xf32>) { + ^bb0(%in: f32, %in_6: f32, %in_7: f32, %in_8: f32, %in_9: f32, %in_10: f32, %out: f32): + %9 = arith.addf %in_9, %cst : 
f32 + %10 = math.sqrt %9 : f32 + %11 = arith.addf %in, %in_6 : f32 + %12 = arith.subf %11, %in_7 : f32 + %13 = arith.mulf %12, %in_8 : f32 + %14 = arith.divf %13, %10 : f32 + %15 = arith.addf %14, %in_10 : f32 + %16 = arith.maximumf %15, %cst_0 : f32 + linalg.yield %16 : f32 + } -> tensor<256x112x112x64xf32> + flow.dispatch.tensor.store %8, %2, offsets = [0, 0, 0, 0], sizes = [256, 112, 112, 64], strides = [1, 1, 1, 1] : tensor<256x112x112x64xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d1, d4)> #map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d1, d4)> #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)> #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module { - func.func @_main_dispatch_15_generic_512x4x42x42x64_f32() { - %cst = arith.constant 1.250000e-01 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = arith.index_castui %0 {stream.alignment = 64 : index, stream.values = [35524672 : index, 240930880 : index, 446337088 : index, 651743296 : index]} : i32 to index - %4 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [57544768 : index, 262950976 : index, 468357184 : index, 673763392 : index]} : i32 to index - %5 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [1728 : index, 36472832 : index, 72943744 : index, 109415936 : index]} : i32 to index - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor> - %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> - %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor> - %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<512x42x4x64xf32> - %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<512x42x4x64xf32> - %11 = tensor.empty() : tensor<512x4x42x42xf32> - %12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<512x4x42x42xf32>) -> tensor<512x4x42x42xf32> - %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %10 : tensor<512x42x4x64xf32>, tensor<512x42x4x64xf32>) outs(%12 : tensor<512x4x42x42xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %15 = arith.mulf %in, %in_1 : f32 - %16 = arith.addf %out, %15 : f32 - linalg.yield %16 : f32 - } -> tensor<512x4x42x42xf32> - %14 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<512x4x42x42xf32>) outs(%11 : tensor<512x4x42x42xf32>) { - ^bb0(%in: f32, %out: f32): - %15 = arith.mulf %in, %cst : f32 - linalg.yield %15 : f32 - } -> tensor<512x4x42x42xf32> - flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0, 0], sizes = [512, 4, 42, 42], strides = [1, 1, 1, 1] : tensor<512x4x42x42xf32> -> !flow.dispatch.tensor> - return - } +func.func @_main_dispatch_15_generic_512x4x42x42x64_f32() { + %cst = arith.constant 1.250000e-01 : f32 + %cst_0 = 
arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = arith.index_castui %0 {stream.alignment = 64 : index, stream.values = [35524672 : index, 240930880 : index, 446337088 : index, 651743296 : index]} : i32 to index + %4 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [57544768 : index, 262950976 : index, 468357184 : index, 673763392 : index]} : i32 to index + %5 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [1728 : index, 36472832 : index, 72943744 : index, 109415936 : index]} : i32 to index + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%4) flags(ReadOnly) : !flow.dispatch.tensor> + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor> + %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<512x42x4x64xf32> + %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [512, 42, 4, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<512x42x4x64xf32> + %11 = tensor.empty() : tensor<512x4x42x42xf32> + %12 = linalg.fill ins(%cst_0 : f32) outs(%11 : tensor<512x4x42x42xf32>) -> tensor<512x4x42x42xf32> + %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%9, %10 : tensor<512x42x4x64xf32>, tensor<512x42x4x64xf32>) outs(%12 : tensor<512x4x42x42xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %15 = arith.mulf %in, %in_1 : f32 + %16 = arith.addf %out, %15 : f32 + linalg.yield %16 : f32 + } -> tensor<512x4x42x42xf32> + %14 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<512x4x42x42xf32>) outs(%11 : tensor<512x4x42x42xf32>) { + ^bb0(%in: f32, %out: f32): + %15 = arith.mulf %in, %cst : f32 + linalg.yield %15 : f32 + } -> tensor<512x4x42x42xf32> + flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0, 0], sizes = [512, 4, 42, 42], strides = [1, 1, 1, 1] : tensor<512x4x42x42xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> #map3 = affine_map<(d0, d1, d2) -> (d1, d2)> #map4 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @i4_dequant_matvec() { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 - %5 = hal.interface.constant.load[5] : i32 - %6 = hal.interface.constant.load[6] : i32 - %7 = hal.interface.constant.load[7] : i32 - %8 = hal.interface.constant.load[8] : i32 - %9 = arith.index_castui %0 : i32 
to index - %10 = arith.index_castui %1 : i32 to index - %11 = arith.index_castui %2 : i32 to index - %12 = arith.extui %3 : i32 to i64 - %13 = arith.extui %4 : i32 to i64 - %14 = arith.shli %13, %c32_i64 : i64 - %15 = arith.ori %12, %14 : i64 - %16 = arith.index_castui %15 : i64 to index - %17 = arith.extui %5 : i32 to i64 - %18 = arith.extui %6 : i32 to i64 - %19 = arith.shli %18, %c32_i64 : i64 - %20 = arith.ori %17, %19 : i64 - %21 = arith.index_castui %20 : i64 to index - %22 = arith.extui %7 : i32 to i64 - %23 = arith.extui %8 : i32 to i64 - %24 = arith.shli %23, %c32_i64 : i64 - %25 = arith.ori %22, %24 : i64 - %26 = arith.index_castui %25 : i64 to index - %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor> - %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor> - %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor> - %30 = flow.dispatch.workload.ordinal %26, 0 : index - %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor>{%30} - %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor>{%30} - %33 = flow.dispatch.tensor.load %27, offsets = [0, 0], sizes = [4096, 11008], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x11008xi4> - %34 = flow.dispatch.tensor.load %28, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor> -> tensor<4096xf32> - %35 = flow.dispatch.tensor.load %29, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor> -> tensor<4096xf32> - %36 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%30, 11008], strides = [1, 1] : !flow.dispatch.tensor>{%30} -> tensor - %37 = tensor.empty(%30) : tensor - %38 = tensor.empty() : tensor<4096x11008xf32> - %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor) -> tensor - %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x11008xi4>, tensor<4096xf32>, tensor<4096xf32>) outs(%38 : tensor<4096x11008xf32>) { - ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): - %42 = arith.extui %in : i4 to i32 - %43 = arith.uitofp %42 : i32 to f32 - %44 = arith.subf %43, %in_1 : f32 - %45 = arith.mulf %44, %in_0 : f32 - linalg.yield %45 : f32 - } -> tensor<4096x11008xf32> - %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%36, %40 : tensor, tensor<4096x11008xf32>) outs(%39 : tensor) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %42 = arith.mulf %in, %in_0 : f32 - %43 = arith.addf %42, %out : f32 - linalg.yield %43 : f32 - } -> tensor - flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%30} - return - } +func.func @i4_dequant_matvec() { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load 
layout(#pipeline_layout) ordinal(4) : i32 + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 + %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32 + %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32 + %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32 + %9 = arith.index_castui %0 : i32 to index + %10 = arith.index_castui %1 : i32 to index + %11 = arith.index_castui %2 : i32 to index + %12 = arith.extui %3 : i32 to i64 + %13 = arith.extui %4 : i32 to i64 + %14 = arith.shli %13, %c32_i64 : i64 + %15 = arith.ori %12, %14 : i64 + %16 = arith.index_castui %15 : i64 to index + %17 = arith.extui %5 : i32 to i64 + %18 = arith.extui %6 : i32 to i64 + %19 = arith.shli %18, %c32_i64 : i64 + %20 = arith.ori %17, %19 : i64 + %21 = arith.index_castui %20 : i64 to index + %22 = arith.extui %7 : i32 to i64 + %23 = arith.extui %8 : i32 to i64 + %24 = arith.shli %23, %c32_i64 : i64 + %25 = arith.ori %22, %24 : i64 + %26 = arith.index_castui %25 : i64 to index + %27 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor> + %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor> + %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor> + %30 = flow.dispatch.workload.ordinal %26, 0 : index + %31 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor>{%30} + %32 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%21) : !flow.dispatch.tensor>{%30} + %33 = flow.dispatch.tensor.load %27, offsets = [0, 0], sizes = [4096, 11008], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x11008xi4> + %34 = flow.dispatch.tensor.load %28, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor> -> tensor<4096xf32> + %35 = flow.dispatch.tensor.load %29, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor> -> tensor<4096xf32> + %36 = flow.dispatch.tensor.load %31, offsets = [0, 0], sizes = [%30, 11008], strides = [1, 1] : !flow.dispatch.tensor>{%30} -> tensor + %37 = tensor.empty(%30) : tensor + %38 = tensor.empty() : tensor<4096x11008xf32> + %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor) -> tensor + %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x11008xi4>, tensor<4096xf32>, tensor<4096xf32>) outs(%38 : tensor<4096x11008xf32>) { + ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): + %42 = arith.extui %in : i4 to i32 + %43 = arith.uitofp %42 : i32 to f32 + %44 = arith.subf %43, %in_1 : f32 + %45 = arith.mulf %44, %in_0 : f32 + linalg.yield %45 : f32 + } -> tensor<4096x11008xf32> + %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction"]} ins(%36, %40 : tensor, tensor<4096x11008xf32>) outs(%39 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %42 = arith.mulf %in, %in_0 : f32 + %43 = arith.addf %42, %out : f32 + linalg.yield %43 : f32 + } -> tensor + flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%30} + return } // CHECK-DAG: #[[$CONFIG:.+]] = 
#iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index 832f16a00c4b3..7313f52c7614f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -1,149 +1,201 @@
 // RUN: iree-opt --iree-gpu-test-target=sm_60 --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" --verify-diagnostics --split-input-file %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
-module {
-  func.func @illegal() attributes {translation_info = #translation} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
-    // expected-error @+1 {{Total number of threads in a thread block 2048 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulSimt}}
-    linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
-    return
-  }
+func.func @illegal() attributes {translation_info = #translation} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x8xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8x16xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x16xf32>
+  // expected-error @+1 {{Total number of threads in a thread block 2048 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulSimt}}
+  linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>)
+  return
 }
 // -----
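// Each case in this file pins its failure mode with an `// expected-error @+1
// {{...}}` annotation on the op that should be rejected; the RUN line's
// --verify-diagnostics flag checks that exactly those diagnostics fire, and
// --split-input-file re-splits the cases at the `// -----` markers.
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #translation = #iree_codegen.translation_info
-module {
-  func.func @illegal() attributes {translation_info = #translation} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32>
-    // expected-error @+1 {{Expected workgroup size in z-dim = 1, but got 2 with compilation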
pipeline LLVMGPUMatmulSimt}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32> - // expected-error @+1 {{Total number of threads in a thread block 1280 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<32x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<32x32xf32> + // expected-error @+1 {{Total number of threads in a thread block 1280 exceeds the limit of 1024 with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32> - // expected-error @+1 {{Number of threads in x-dim 48 is not a multiple of warp size (32) or integer units of warps in x-dim with compilation pipeline LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<32x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<32x32xf32> + // expected-error @+1 {{Number of threads in x-dim 48 is not a multiple of warp size (32) or integer units of warps in x-dim with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, 
storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32> - // expected-error @+1 {{Expected workgroup size in z-dim = 1, but got 2 with compilation pipeline LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<32x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<32x32xf32> + // expected-error @+1 {{Expected workgroup size in z-dim = 1, but got 2 with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x32xf32> - // expected-error @+1 {{Thread block shape 32, 32, 20 cannot be tiled on matmul shape 32, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<32x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<32x32xf32> + // expected-error @+1 {{Thread block shape 32, 32, 20 cannot be tiled on matmul shape 32, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x32xf32>) outs(%2 : memref<32x32xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : 
memref<1024x512xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32> - // expected-error @+1 {{Tensor Core instruction shape 16, 16, 8 cannot be tiled on warp shape 64, 8, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1024x512xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<512x256xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1024x256xf32> + // expected-error @+1 {{Tensor Core instruction shape 16, 16, 8 cannot be tiled on warp shape 64, 8, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x32xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x32xf32> - // expected-error @+1 {{Thread block shape 32, 32, 16 cannot be tiled on matmul shape 48, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<48x16xf32>, memref<16x32xf32>) outs(%2 : memref<48x32xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<48x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<48x32xf32> + // expected-error @+1 {{Thread block shape 32, 32, 16 cannot be tiled on matmul shape 48, 32, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<48x16xf32>, memref<16x32xf32>) outs(%2 : memref<48x32xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<32x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x48xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<32x48xf32> - // expected-error @+1 {{Thread block shape 32, 32, 
16 cannot be tiled on matmul shape 32, 48, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x48xf32>) outs(%2 : memref<32x48xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<32x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x48xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<32x48xf32> + // expected-error @+1 {{Thread block shape 32, 32, 16 cannot be tiled on matmul shape 32, 48, 16 with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x16xf32>, memref<16x48xf32>) outs(%2 : memref<32x48xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 8)> #map1 = affine_map<()[s0] -> (s0 * 32)> @@ -151,89 +203,102 @@ module { #map3 = affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)> #map4 = affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)> #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %c4 = arith.constant 4 : index - %c32 = arith.constant 32 : index - %c64 = arith.constant 64 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : memref<4x32x1024xf32> - memref.assume_alignment %0, 32 : memref<4x32x1024xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : memref<4x1024x64xf32> - memref.assume_alignment %1, 32 : memref<4x1024x64xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : memref<4x32x64xf32> - memref.assume_alignment %2, 32 : memref<4x32x64xf32> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z { - %3 = affine.apply #map()[%workgroup_id_y] - %4 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %3 to %c32 step %4 { - %5 = affine.apply #map1()[%workgroup_id_x] - %6 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg2 = %5 to %c64 step %6 { - %subview = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, #map2> - %subview_0 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, #map3> - %subview_1 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, #map4> - linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<1x8x32xf32, #map4>) - // expected-error @+1 {{Received batch tile dimension of 2 instead of 0 for non-partitionable loops with compilation pipeline 
LLVMGPUMatmulTensorCore}} - linalg.batch_matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<1x8x1024xf32, #map2>, memref<1x1024x32xf32, #map3>) outs(%subview_1 : memref<1x8x32xf32, #map4>) - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %c4 = arith.constant 4 : index + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : memref<4x32x1024xf32> + memref.assume_alignment %0, 32 : memref<4x32x1024xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : memref<4x1024x64xf32> + memref.assume_alignment %1, 32 : memref<4x1024x64xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : memref<4x32x64xf32> + memref.assume_alignment %2, 32 : memref<4x32x64xf32> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z { + %3 = affine.apply #map()[%workgroup_id_y] + %4 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %3 to %c32 step %4 { + %5 = affine.apply #map1()[%workgroup_id_x] + %6 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg2 = %5 to %c64 step %6 { + %subview = memref.subview %0[%arg0, %arg1, 0] [1, 8, 1024] [1, 1, 1] : memref<4x32x1024xf32> to memref<1x8x1024xf32, #map2> + %subview_0 = memref.subview %1[%arg0, 0, %arg2] [1, 1024, 32] [1, 1, 1] : memref<4x1024x64xf32> to memref<1x1024x32xf32, #map3> + %subview_1 = memref.subview %2[%arg0, %arg1, %arg2] [1, 8, 32] [1, 1, 1] : memref<4x32x64xf32> to memref<1x8x32xf32, #map4> + linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%subview_1 : memref<1x8x32xf32, #map4>) + // expected-error @+1 {{Received batch tile dimension of 2 instead of 0 for non-partitionable loops with compilation pipeline LLVMGPUMatmulTensorCore}} + linalg.batch_matmul {lowering_config = #config} ins(%subview, %subview_0 : memref<1x8x1024xf32, #map2>, memref<1x1024x32xf32, #map3>) outs(%subview_1 : memref<1x8x32xf32, #map4>) } } - return } + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32> - // expected-error @+1 {{Thread block shape 64, 32, 48 cannot be tiled on matmul shape 1024, 256, 512 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>) - return - } +func.func @illegal() attributes 
{translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1024x512xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<512x256xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1024x256xf32> + // expected-error @+1 {{Thread block shape 64, 32, 48 cannot be tiled on matmul shape 1024, 256, 512 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xf32> - // expected-error @+1 {{Tensor Core instruction shape 16, 8, 8 cannot be tiled on warp shape 64, 8, 4 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1024x512xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<512x256xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1024x256xf32> + // expected-error @+1 {{Tensor Core instruction shape 16, 8, 8 cannot be tiled on warp shape 64, 8, 4 with compilation pipeline LLVMGPUMatmulTensorCoreMmaSync}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xf32>, memref<512x256xf32>) outs(%2 : memref<1024x256xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -module { - func.func @illegal() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024x512xi8> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<512x256xi8> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1024x256xi8> - // expected-error @+1 {{Expected f16, bf16 or f32 for Tensor Core (MMA.SYNC) pipeline}} - linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xi8>, memref<512x256xi8>) outs(%2 : memref<1024x256xi8>) - return - } +func.func @illegal() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1024x512xi8> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<512x256xi8> + %2 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1024x256xi8> + // expected-error @+1 {{Expected f16, bf16 or f32 for Tensor Core (MMA.SYNC) pipeline}} + linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<1024x512xi8>, memref<512x256xi8>) outs(%2 : memref<1024x256xi8>) + return } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir index fe8242329c887..933c15a6da530 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/layout_analysis_and_distribution.mlir @@ -1,15 +1,22 @@ // RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule -cse -split-input-file --verify-diagnostics | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> builtin.module attributes { transform.with_named_sequence } { func.func @matmul_dispatch_0_matmul_16x8x16() { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16> %cst_0 = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16> memref.assume_alignment %0, 64 : memref<16x16xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16> memref.assume_alignment %1, 64 : memref<8x16xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x8xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<16x8xf16> memref.assume_alignment %2, 64 : memref<16x8xf16> %3 = vector.transfer_read %0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> %4 = vector.transfer_read %1[%c0, %c0], %cst_0 {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : memref<8x16xf16>, vector<8x16xf16> @@ -25,7 +32,6 @@ builtin.module attributes { transform.with_named_sequence } { } } // module - // CHECK-DAG: #[[MAP:.+]] = affine_map<(d0, d1, d2) -> (d1 + d2 * 16)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0 * 2 + 1)> @@ -36,13 +42,13 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: func.func @matmul_dispatch_0_matmul_16x8x16() { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x2x2xf16> -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x16xf16> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<16x16xf16> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) 
alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<8x16xf16> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<8x16xf16> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<16x8xf16> // CHECK-DAG: %[[D3:.+]] = gpu.thread_id x @@ -130,17 +136,24 @@ builtin.module attributes { transform.with_named_sequence } { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> builtin.module attributes { transform.with_named_sequence } { func.func @matmul_reduction() { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16> %init = arith.constant dense<-1.000000e+04> : vector<16xf16> %cst_0 = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16> memref.assume_alignment %0, 64 : memref<16x16xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16> memref.assume_alignment %1, 64 : memref<8x16xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x8xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<16x8xf16> memref.assume_alignment %2, 64 : memref<16x8xf16> %3 = vector.transfer_read %0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> %4 = vector.transfer_read %1[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<8x16xf16>, vector<8x16xf16> @@ -170,13 +183,13 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x2x2xf16> // CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<-1.000000e+04> : vector<1x1x2x2xf16> -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x16xf16> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<16x16xf16> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<8x16xf16> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<8x16xf16> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) : 
memref<16x8xf16> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<16x8xf16> // CHECK-DAG: %[[D3:.+]] = gpu.thread_id x @@ -314,6 +327,14 @@ builtin.module attributes { transform.with_named_sequence } { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<()[s0] -> (s0 * 16)> #map2 = affine_map<(d0)[s0] -> (d0 + s0)> @@ -327,13 +348,13 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x64xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x64xf16> memref.assume_alignment %0, 64 : memref<16x64xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x64xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x64xf16> memref.assume_alignment %1, 64 : memref<8x64xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x8xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x8xf16> memref.assume_alignment %2, 64 : memref<16x8xf16> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<16x8xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<16x8xf16> memref.assume_alignment %3, 64 : memref<16x8xf16> %workgroup_id_x = hal.interface.workgroup.id[0] : index %4 = affine.apply #map1()[%workgroup_id_x] @@ -373,16 +394,16 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x64xf16> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<16x64xf16> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<8x64xf16> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<8x64xf16> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<16x8xf16> -// CHECK: %[[D3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) +// CHECK: %[[D3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) 
alignment(64) // CHECK-SAME: offset(%[[C0]]) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D3]], 64 : memref<16x8xf16> // CHECK: %[[WORKGROUP_ID_X:.+]] = hal.interface.workgroup.id[0] : index @@ -502,6 +523,14 @@ builtin.module attributes { transform.with_named_sequence } { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<()[s0] -> (s0 * 16)> #map2 = affine_map<(d0)[s0] -> (d0 + s0)> @@ -515,13 +544,13 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x64xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x64xf16> memref.assume_alignment %0, 64 : memref<16x64xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x64xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x64xf16> memref.assume_alignment %1, 64 : memref<8x64xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x8xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x8xf16> memref.assume_alignment %2, 64 : memref<16x8xf16> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<16x8xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<16x8xf16> memref.assume_alignment %3, 64 : memref<16x8xf16> %workgroup_id_x = hal.interface.workgroup.id[0] : index %4 = affine.apply #map1()[%workgroup_id_x] @@ -561,16 +590,16 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x64xf16> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<16x64xf16> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<8x64xf16> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<8x64xf16> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<16x8xf16> -// CHECK: %[[D3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) +// CHECK: 
%[[D3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) alignment(64) // CHECK-SAME: offset(%[[C0]]) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D3]], 64 : memref<16x8xf16> // CHECK: %[[WORKGROUP_ID_X:.+]] = hal.interface.workgroup.id[0] : index @@ -681,6 +710,14 @@ builtin.module attributes { transform.with_named_sequence } { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> #map3 = affine_map<(d0, d1, d2) -> (d1, d2)> #map4 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -689,13 +726,13 @@ builtin.module attributes { transform.with_named_sequence } { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16> %cst_1 = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16> memref.assume_alignment %0, 64 : memref<16x16xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16> memref.assume_alignment %1, 64 : memref<8x16xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x8xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x8xf16> memref.assume_alignment %2, 64 : memref<16x8xf16> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<16x8xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<16x8xf16> memref.assume_alignment %3, 64 : memref<16x8xf16> %5 = vector.transfer_read %0[%c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16> %6 = vector.transfer_read %1[%c0, %c0], %cst_1 {in_bounds = [true, true]} : memref<8x16xf16>, vector<8x16xf16> @@ -725,16 +762,16 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: func.func @matmul_dispatch_0_matmul_16x8x16() { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x2x2xf16> -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x16xf16> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<16x16xf16> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<8x16xf16> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<8x16xf16> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan 
layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<16x8xf16> -// CHECK: %[[D3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) +// CHECK: %[[D3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) alignment(64) // CHECK-SAME: offset(%[[C0]]) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D3]], 64 : memref<16x8xf16> // CHECK-DAG: %[[D4:.+]] = gpu.thread_id x @@ -842,14 +879,21 @@ builtin.module attributes { transform.with_named_sequence } { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> builtin.module attributes { transform.with_named_sequence } { func.func @matmul_dispatch_0_matmul_16x8x16_shared() { %c0 = arith.constant 0 : index %cst = arith.constant dense<0.000000e+00> : vector<16x8xf16> %cst_0 = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16, #gpu.address_space> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16, #gpu.address_space> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x8xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<16x16xf16, #gpu.address_space> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<8x16xf16, #gpu.address_space> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<16x8xf16> memref.assume_alignment %2, 64 : memref<16x8xf16> %3 = vector.transfer_read %0[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf16, #gpu.address_space>, vector<16x16xf16> %4 = vector.transfer_read %1[%c0, %c0], %cst_0 {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : memref<8x16xf16, #gpu.address_space>, vector<8x16xf16> @@ -914,6 +958,17 @@ builtin.module attributes { transform.with_named_sequence } { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer>, + #hal.descriptor_set.binding<5, storage_buffer>, + #hal.descriptor_set.binding<6, storage_buffer> + ]> +]> #map = affine_map<(d0) -> (d0 * 16)> #map1 = affine_map<(d0, d1) -> (d1, d0)> #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> @@ -925,19 +980,19 @@ builtin.module attributes { transform.with_named_sequence } { %cst = arith.constant dense<0.000000e+00> : vector<16x16xf16> %c0_0 = arith.constant 0 : index %cst_1 = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x16xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x16xf16> memref.assume_alignment %0, 64 : memref<16x16xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x16xf16> + 
%1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x16xf16> memref.assume_alignment %1, 64 : memref<16x16xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16xf16> memref.assume_alignment %2, 64 : memref<16xf16> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16xf16> memref.assume_alignment %3, 64 : memref<16xf16> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x8xf16> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x8xf16> memref.assume_alignment %4, 64 : memref<16x8xf16> - %5 = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x8xf16> + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(5) alignment(64) offset(%c0_0) flags(ReadOnly) : memref<16x8xf16> memref.assume_alignment %5, 64 : memref<16x8xf16> - %6 = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) offset(%c0_0) : memref<16x8xf16> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(6) alignment(64) offset(%c0_0) : memref<16x8xf16> memref.assume_alignment %6, 64 : memref<16x8xf16> %c1 = arith.constant 1 : index %c1_2 = arith.constant 1 : index @@ -996,25 +1051,25 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<16x16xf16> // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[CST_0:.+]] = arith.constant 0.000000e+00 : f16 -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x16xf16> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<16x16xf16> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x16xf16> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<16x16xf16> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16xf16> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<16xf16> -// CHECK: %[[D3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) +// CHECK: %[[D3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16xf16> // CHECK: memref.assume_alignment %[[D3]], 64 : memref<16xf16> -// CHECK: %[[D4:.+]] = 
hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) +// CHECK: %[[D4:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(4) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D4]], 64 : memref<16x8xf16> -// CHECK: %[[D5:.+]] = hal.interface.binding.subspan set(0) binding(5) type(storage_buffer) alignment(64) +// CHECK: %[[D5:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(5) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D5]], 64 : memref<16x8xf16> -// CHECK: %[[D6:.+]] = hal.interface.binding.subspan set(0) binding(6) type(storage_buffer) alignment(64) +// CHECK: %[[D6:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(6) alignment(64) // CHECK-SAME: offset(%[[C0]]) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D6]], 64 : memref<16x8xf16> // CHECK: %[[WORKGROUP_ID_X:.+]] = hal.interface.workgroup.id[0] : index @@ -1066,6 +1121,14 @@ builtin.module attributes { transform.with_named_sequence } { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0) -> (d0 * 16)> #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> @@ -1077,13 +1140,13 @@ builtin.module attributes { transform.with_named_sequence } { %cst_0 = arith.constant dense<0.000000e+00> : vector<16x8xf16> %c0_1 = arith.constant 0 : index %cst_2 = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0_1) flags(ReadOnly) : memref<16x16xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0_1) flags(ReadOnly) : memref<16x16xf16> memref.assume_alignment %0, 64 : memref<16x16xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0_1) flags(ReadOnly) : memref<16x16xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0_1) flags(ReadOnly) : memref<16x16xf16> memref.assume_alignment %1, 64 : memref<16x16xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0_1) flags(ReadOnly) : memref<8x16xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0_1) flags(ReadOnly) : memref<8x16xf16> memref.assume_alignment %2, 64 : memref<8x16xf16> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0_1) : memref<16x8xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0_1) : memref<16x8xf16> memref.assume_alignment %3, 64 : memref<16x8xf16> %c1 = arith.constant 1 : index %c1_3 = arith.constant 1 : index @@ -1125,16 +1188,16 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-DAG: %[[CST:.+]] = arith.constant dense<0.000000e+00> : vector<1x2x2x2xf16> // CHECK-DAG: %[[CST_0:.+]] = arith.constant dense<0.000000e+00> : vector<1x1x2x2xf16> // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) +// CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) 
binding(0) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x16xf16> // CHECK: memref.assume_alignment %[[D0]], 64 : memref<16x16xf16> -// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) +// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<16x16xf16> // CHECK: memref.assume_alignment %[[D1]], 64 : memref<16x16xf16> -// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) +// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) // CHECK-SAME: offset(%[[C0]]) flags(ReadOnly) : memref<8x16xf16> // CHECK: memref.assume_alignment %[[D2]], 64 : memref<8x16xf16> -// CHECK: %[[D3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) +// CHECK: %[[D3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) alignment(64) // CHECK-SAME: offset(%[[C0]]) : memref<16x8xf16> // CHECK: memref.assume_alignment %[[D3]], 64 : memref<16x8xf16> // CHECK: %[[WORKGROUP_ID_X:.+]] = hal.interface.workgroup.id[0] : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir index da2bd6266633f..18100b1c924be 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir @@ -10,67 +10,71 @@ // RUN: --iree-codegen-transform-dialect-library=%p/transform_dialect_codegen_foreach_to_gpu_spec.mlir@__transform_main | \ // RUN: FileCheck %s --check-prefix=FOREACH-TO-GPU +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -module { - func.func @matmul_static_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<250x1020xf32>> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32> - %5 = tensor.empty() : tensor<250x1020xf32> - %cst = arith.constant 0.000000e+00 : f32 - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32> +func.func @matmul_static_dispatch_0() attributes {hal.executable.target = #executable_target_cuda_nvptx_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<250x1020xf32>> + %3 = 
flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<250x500xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<500x1020xf32> + %5 = tensor.empty() : tensor<250x1020xf32> + %cst = arith.constant 0.000000e+00 : f32 + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<250x1020xf32>) -> tensor<250x1020xf32> - // CHECK: memref.assume_alignment %{{.*}}, 64 : memref<250x1020xf32, #hal.descriptor_type> - // CHECK-NEXT: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type>) - // CHECK-NEXT: linalg.matmul{{.*}}ins(%{{.*}} : memref<250x500xf32, #hal.descriptor_type>, memref<500x1020xf32, #hal.descriptor_type>) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type>) - // CHECK-NEXT: return + // CHECK: memref.assume_alignment %{{.*}}, 64 : memref<250x1020xf32, #hal.descriptor_type> + // CHECK-NEXT: linalg.fill ins(%{{.*}} : f32) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type>) + // CHECK-NEXT: linalg.matmul{{.*}}ins(%{{.*}} : memref<250x500xf32, #hal.descriptor_type>, memref<500x1020xf32, #hal.descriptor_type>) outs(%{{.*}} : memref<250x1020xf32, #hal.descriptor_type>) + // CHECK-NEXT: return - // workgroup_size is explicitly set to [10, 11]. - // FOREACH-TO-GPU: #[[TRANSLATION:.+]] = #iree_codegen.translation_info - // FOREACH-TO-GPU: func.func @matmul_static_dispatch_0() - // FOREACH-TO-GPU-SAME: translation_info = #translation - // FOREACH-TO-GPU-DAG: %[[C0:.*]] = arith.constant 0 : index - // FOREACH-TO-GPU-DAG: %[[C1:.*]] = arith.constant 1 : index - // FOREACH-TO-GPU-DAG: %[[C5:.*]] = arith.constant 5 : index - // FOREACH-TO-GPU-DAG: %[[C7:.*]] = arith.constant 7 : index - // FOREACH-TO-GPU-DAG: %[[C9:.*]] = arith.constant 9 : index - // FOREACH-TO-GPU-DAG: %[[CF0:.*]] = arith.constant 0.000000e+00 : f32 - // FOREACH-TO-GPU: %[[TIDX:.*]] = gpu.thread_id x - // FOREACH-TO-GPU: %[[TIDY:.*]] = gpu.thread_id y - // - // Fill is tiled by 5x1 with thread_dim_mapping = [1, 0, 2], predicate appropriately. - // FOREACH-TO-GPU: %[[LT1:.*]] = arith.cmpi ult, %[[TIDX]], %[[C1]] : index - // FOREACH-TO-GPU: %[[LT5:.*]] = arith.cmpi ult, %[[TIDY]], %[[C5]] : index - // FOREACH-TO-GPU: %[[COND:.*]] = arith.andi %[[LT1]], %[[LT5]] : i1 - // FOREACH-TO-GPU: scf.if %[[COND]] { - // FOREACH-TO-GPU: affine.apply #{{.*}}()[%[[TIDY]]] - // FOREACH-TO-GPU: affine.apply #{{.*}}()[%[[TIDX]]] - // FOREACH-TO-GPU: linalg.fill - // FOREACH-TO-GPU: } - // FOREACH-TO-GPU: gpu.barrier - // - // Matmul is tiled by 7x9 with identity (omitted) thread_dim_mapping, predicate appropriately. - // FOREACH-TO-GPU: %[[LT7:.*]] = arith.cmpi ult, %[[TIDX]], %[[C7]] : index - // FOREACH-TO-GPU: %[[LT9:.*]] = arith.cmpi ult, %[[TIDY]], %[[C9]] : index - // FOREACH-TO-GPU: %[[COND2:.*]] = arith.andi %[[LT7]], %[[LT9]] : i1 - // FOREACH-TO-GPU: scf.if %[[COND2]] { - // FOREACH-TO-GPU: affine.min #{{.*}}()[%[[TIDX]]] - // FOREACH-TO-GPU: affine.min #{{.*}}()[%[[TIDY]]] - // FOREACH-TO-GPU-DAG: affine.apply #{{.*}}()[%[[TIDX]]] - // FOREACH-TO-GPU-DAG: %[[svA:.*]] = memref.subview {{.*}} : memref<250x500xf32{{.*}}> to memref to memref<500x?xf32 - // FOREACH-TO-GPU-DAG: %[[svC:.*]] = memref.subview {{.*}} : memref<250x1020xf32{{.*}}> to memref, memref<500x?xf32{{.*}}>) outs(%[[svC]] : memref) - // FOREACH-TO-GPU: } - // FOREACH-TO-GPU: gpu.barrier - // + // workgroup_size is explicitly set to [10, 11]. 
+ // FOREACH-TO-GPU: #[[TRANSLATION:.+]] = #iree_codegen.translation_info + // FOREACH-TO-GPU: func.func @matmul_static_dispatch_0() + // FOREACH-TO-GPU-SAME: translation_info = #translation + // FOREACH-TO-GPU-DAG: %[[C0:.*]] = arith.constant 0 : index + // FOREACH-TO-GPU-DAG: %[[C1:.*]] = arith.constant 1 : index + // FOREACH-TO-GPU-DAG: %[[C5:.*]] = arith.constant 5 : index + // FOREACH-TO-GPU-DAG: %[[C7:.*]] = arith.constant 7 : index + // FOREACH-TO-GPU-DAG: %[[C9:.*]] = arith.constant 9 : index + // FOREACH-TO-GPU-DAG: %[[CF0:.*]] = arith.constant 0.000000e+00 : f32 + // FOREACH-TO-GPU: %[[TIDX:.*]] = gpu.thread_id x + // FOREACH-TO-GPU: %[[TIDY:.*]] = gpu.thread_id y + // + // Fill is tiled by 5x1 with thread_dim_mapping = [1, 0, 2], predicate appropriately. + // FOREACH-TO-GPU: %[[LT1:.*]] = arith.cmpi ult, %[[TIDX]], %[[C1]] : index + // FOREACH-TO-GPU: %[[LT5:.*]] = arith.cmpi ult, %[[TIDY]], %[[C5]] : index + // FOREACH-TO-GPU: %[[COND:.*]] = arith.andi %[[LT1]], %[[LT5]] : i1 + // FOREACH-TO-GPU: scf.if %[[COND]] { + // FOREACH-TO-GPU: affine.apply #{{.*}}()[%[[TIDY]]] + // FOREACH-TO-GPU: affine.apply #{{.*}}()[%[[TIDX]]] + // FOREACH-TO-GPU: linalg.fill + // FOREACH-TO-GPU: } + // FOREACH-TO-GPU: gpu.barrier + // + // Matmul is tiled by 7x9 with identity (omitted) thread_dim_mapping, predicate appropriately. + // FOREACH-TO-GPU: %[[LT7:.*]] = arith.cmpi ult, %[[TIDX]], %[[C7]] : index + // FOREACH-TO-GPU: %[[LT9:.*]] = arith.cmpi ult, %[[TIDY]], %[[C9]] : index + // FOREACH-TO-GPU: %[[COND2:.*]] = arith.andi %[[LT7]], %[[LT9]] : i1 + // FOREACH-TO-GPU: scf.if %[[COND2]] { + // FOREACH-TO-GPU: affine.min #{{.*}}()[%[[TIDX]]] + // FOREACH-TO-GPU: affine.min #{{.*}}()[%[[TIDY]]] + // FOREACH-TO-GPU-DAG: affine.apply #{{.*}}()[%[[TIDX]]] + // FOREACH-TO-GPU-DAG: %[[svA:.*]] = memref.subview {{.*}} : memref<250x500xf32{{.*}}> to memref to memref<500x?xf32 + // FOREACH-TO-GPU-DAG: %[[svC:.*]] = memref.subview {{.*}} : memref<250x1020xf32{{.*}}> to memref, memref<500x?xf32{{.*}}>) outs(%[[svC]] : memref) + // FOREACH-TO-GPU: } + // FOREACH-TO-GPU: gpu.barrier - %7 = linalg.matmul ins(%3, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%6 : tensor<250x1020xf32>) -> tensor<250x1020xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [250, 1020], strides = [1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor> - return - } + %7 = linalg.matmul ins(%3, %4 : tensor<250x500xf32>, tensor<500x1020xf32>) outs(%6 : tensor<250x1020xf32>) -> tensor<250x1020xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [250, 1020], strides = [1, 1] : tensor<250x1020xf32> -> !flow.dispatch.tensor> + return } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir index b88f76ac1ab3b..1082caf638274 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/llvmgpu_bufferize.mlir @@ -1,32 +1,36 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-llvmgpu-bufferization-pipeline))" --split-input-file %s | FileCheck %s -module { - func.func @bufferize_with_thread_private_memory(%arg0: index) { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %cst_ved = arith.constant dense<0.000000e+00> : vector<1x1x4x4xf16> - %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : 
!flow.dispatch.tensor<readonly:tensor<1xf16>> - %1 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1x8x64xf16>> - %2 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg0, %arg0, %arg0], sizes = [1, 1, 8, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1x8x64xf16>> -> tensor<1x1x8x64xf16> - %3 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xf16>> -> tensor<1xf16> - %4 = scf.forall (%arg1, %arg2) in (2, 16) shared_outs(%arg3 = %2) -> (tensor<1x1x8x64xf16>) { - %5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1) - %6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) - %extracted_slice = tensor.extract_slice %arg3[0, 0, %5, %6] [1, 1, 4, 4] [1, 1, 1, 1] : tensor<1x1x8x64xf16> to tensor<1x1x4x4xf16> - %alloc_tensor = bufferization.alloc_tensor() : tensor<1x1x4x4xf16> - %copy = bufferization.materialize_in_destination %extracted_slice in %alloc_tensor : (tensor<1x1x4x4xf16>, tensor<1x1x4x4xf16>) -> tensor<1x1x4x4xf16> - %7 = vector.transfer_read %3[%c0], %cst {in_bounds = [true]} : tensor<1xf16>, vector<1xf16> - %8 = vector.broadcast %7 : vector<1xf16> to vector<1x1x4x4xf16> - %9 = vector.transfer_read %arg3[%c0, %c0, %5, %6], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x8x64xf16>, vector<1x1x4x4xf16> - %10 = arith.addf %9, %8 : vector<1x1x4x4xf16> - %11 = vector.transfer_write %10, %copy[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x4x4xf16>, tensor<1x1x4x4xf16> - scf.forall.in_parallel { - tensor.parallel_insert_slice %11 into %arg3[0, 0, %5, %6] [1, 1, 4, 4] [1, 1, 1, 1] : tensor<1x1x4x4xf16> into tensor<1x1x8x64xf16> - } - } {mapping = [#gpu.thread, #gpu.thread]} - flow.dispatch.tensor.store %4, %1, offsets = [%arg0, %arg0, %arg0, %arg0], sizes = [1, 1, 8, 64], strides = [1, 1, 1, 1] : tensor<1x1x8x64xf16> -> !flow.dispatch.tensor<readwrite:tensor<1x1x8x64xf16>> - return - } +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> +func.func @bufferize_with_thread_private_memory(%arg0: index) { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_ved = arith.constant dense<0.000000e+00> : vector<1x1x4x4xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1xf16>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<1x1x8x64xf16>> + %2 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg0, %arg0, %arg0], sizes = [1, 1, 8, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<1x1x8x64xf16>> -> tensor<1x1x8x64xf16> + %3 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xf16>> -> tensor<1xf16> + %4 = scf.forall (%arg1, %arg2) in (2, 16) shared_outs(%arg3 = %2) -> (tensor<1x1x8x64xf16>) { + %5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1) + %6 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2) + %extracted_slice = tensor.extract_slice %arg3[0, 0, %5, %6] [1, 1, 4, 4] [1, 1, 1, 1] : tensor<1x1x8x64xf16> to tensor<1x1x4x4xf16> + %alloc_tensor = bufferization.alloc_tensor() : tensor<1x1x4x4xf16> + %copy = bufferization.materialize_in_destination %extracted_slice in %alloc_tensor : (tensor<1x1x4x4xf16>, tensor<1x1x4x4xf16>) -> tensor<1x1x4x4xf16> + %7 = vector.transfer_read %3[%c0], %cst {in_bounds = [true]} : tensor<1xf16>, vector<1xf16> + %8 = vector.broadcast %7 : vector<1xf16> to vector<1x1x4x4xf16> + %9 = vector.transfer_read %arg3[%c0, %c0, %5, 
%6], %cst {in_bounds = [true, true, true, true]} : tensor<1x1x8x64xf16>, vector<1x1x4x4xf16> + %10 = arith.addf %9, %8 : vector<1x1x4x4xf16> + %11 = vector.transfer_write %10, %copy[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x4x4xf16>, tensor<1x1x4x4xf16> + scf.forall.in_parallel { + tensor.parallel_insert_slice %11 into %arg3[0, 0, %5, %6] [1, 1, 4, 4] [1, 1, 1, 1] : tensor<1x1x4x4xf16> into tensor<1x1x8x64xf16> + } + } {mapping = [#gpu.thread, #gpu.thread]} + flow.dispatch.tensor.store %4, %1, offsets = [%arg0, %arg0, %arg0, %arg0], sizes = [1, 1, 8, 64], strides = [1, 1, 1, 1] : tensor<1x1x8x64xf16> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @bufferize_with_thread_private_memory // CHECK: scf.forall {{.*}} in (2, 16) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir index 3c58ddce47324..8ce12fb39f3e7 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_extract_address_computation.mlir @@ -85,9 +85,9 @@ hal.executable private @matmul_dispatch_0 { func.func @matmul_dispatch_0_matmul_2560x2560x2560() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2560, 2560], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2560x2560xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2560, 2560], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2560x2560xf16> %5 = tensor.empty() : tensor<2560x2560xf16> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir index ad544cd2d9417..ab7136cd9ac75 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_mma_sync_pipeline_test.mlir @@ -14,7 +14,7 @@ ]> hal.executable @mma_fused_fp16 { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @_large_aligned_dispatch_0 ordinal(0) layout(#hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) { + hal.executable.export public @_large_aligned_dispatch_0 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -25,10 
+25,10 @@ hal.executable @mma_fused_fp16 { %cst = arith.constant 0.000000e+00 : f16 %c2048 = arith.constant 2048 : index %c512 = arith.constant 512 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %di = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %di = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1024xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] @@ -96,7 +96,7 @@ hal.executable @mma_fused_fp16 { ]> hal.executable @mma_fused_f32 { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @_large_aligned_dispatch_0 ordinal(0) layout(#hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>]>]>) { + hal.executable.export public @_large_aligned_dispatch_0 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -107,10 +107,10 @@ hal.executable @mma_fused_f32 { %cst = arith.constant 0.000000e+00 : f32 %c2048 = arith.constant 2048 : index %c512 = arith.constant 512 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %di = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %di = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1024xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir index 12725d6c1b509..39aec0fd1f064 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir @@ -21,9 +21,9 @@ hal.executable @simpleMath_ex_dispatch_0 { builtin.module { func.func @add_dispatch_0() { %c0 = 
arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = tensor.empty() : tensor<16xf32> %4 = flow.dispatch.tensor.load %0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor> -> tensor<16xf32> %5 = flow.dispatch.tensor.load %1, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor> -> tensor<16xf32> @@ -68,9 +68,9 @@ hal.executable @dot_dispatch_0 { %c0 = arith.constant 0 : index %c1024 = arith.constant 1024 : index %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x1024xf32> %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] @@ -139,9 +139,9 @@ hal.executable @dot_dispatch_0 { %c0 = arith.constant 0 : index %c1024 = arith.constant 1024 : index %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x1024xf32> %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] @@ -193,9 +193,9 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %c2 = arith.constant 2 : index %c3 = arith.constant 3 : index %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %11 = flow.dispatch.tensor.load %0, offsets = [0, 0 ,0, 0], sizes = [1, 4, 4, 2], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4x2xf32> %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 2, 2, 1], strides = [1, 1, 1, 1] @@ -237,8 +237,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { builtin.module { func.func @add_dispatch_0() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %3 = tensor.empty() : tensor<16xf32> %4 = flow.dispatch.tensor.load %0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor> -> tensor<16xf32> %5 = arith.constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> @@ -279,8 +279,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 %c96 = arith.constant 96 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [14, 14, 96], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<14x14x96xf32> %8 = tensor.empty() : tensor<96xf32> @@ -325,9 +325,9 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { func.func @vector_add_dispatch() { %c0 = arith.constant 0 : index %c16384 = arith.constant 16384 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %6 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [16384], strides = [1] : !flow.dispatch.tensor> -> tensor<16384xf32> %8 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [16384], strides = [1] @@ -379,8 +379,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %c0 = arith.constant 0 : index %c16384 = arith.constant 16384 : index %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) 
set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 16384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x16384xf32> %8 = tensor.empty() : tensor<16384xf32> @@ -426,10 +426,10 @@ hal.executable @mma_fused { %cst = arith.constant 0.000000e+00 : f32 %c2048 = arith.constant 2048 : index %c512 = arith.constant 512 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %di = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %di = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1024xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] @@ -509,10 +509,10 @@ hal.executable @mma_fused_fp16 { %cst = arith.constant 0.000000e+00 : f16 %c2048 = arith.constant 2048 : index %c512 = arith.constant 512 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %di = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %di = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x1024xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] @@ -597,11 +597,11 @@ hal.executable @mma_fused_fp16 { %c4 = arith.constant 4 : index %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) + %2 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor> %11 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x32x1024xf32> @@ -674,9 +674,9 @@ hal.executable @mma_fused_fp16 { func.func @split_k_gemm() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2048, 4, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2048x4x256xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 256, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x256x512xf32> %5 = tensor.empty() : tensor<4x2048x512xf32> @@ -718,7 +718,7 @@ hal.executable @mma_fused_fp16 { // ----- -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -738,10 +738,10 @@ hal.executable @mma_fused_fp16 { %c1_i64 = arith.constant 1 : i64 %c2_i64 = arith.constant 2 : i64 %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %s = arith.index_cast %0 : i32 to index - %14 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%s) : !flow.dispatch.tensor>{%s, %s, %s} - %15 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%s) : !flow.dispatch.tensor>{%s} + %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%s) : !flow.dispatch.tensor>{%s, %s, %s} + %15 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%s) : !flow.dispatch.tensor>{%s} %16 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0, 0], sizes = [%s, 2048, %s, %s], strides = [1, 1, 1, 1] : !flow.dispatch.tensor>{%s, %s, %s} -> tensor %19 = tensor.empty(%s) : tensor %38 = tensor.empty(%s, %s) : tensor @@ -783,8 +783,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %c0 = arith.constant 0 : index %c1024 = arith.constant 1024 : index %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> %8 = tensor.empty() : tensor<512xf32> @@ 
-837,8 +837,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %c1024 = arith.constant 1024 : index %cst_0 = arith.constant 3.840000e+02 : f32 %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> %8 = tensor.empty() : tensor<512xf32> @@ -896,8 +896,8 @@ hal.executable private @shared_mem_alloc { func.func @shared_mem_alloc() { %c0 = arith.constant 0 : index %cst = arith.constant dense<0xFF800000> : tensor<14x14x480xf32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [29, 29, 480], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<29x29x480xf32> %3 = tensor.empty() : tensor<3x3xf32> %4 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0 * 2 + d3, d1 * 2 + d4, d2)>, affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%2, %3 : tensor<29x29x480xf32>, tensor<3x3xf32>) outs(%cst : tensor<14x14x480xf32>) { @@ -946,8 +946,8 @@ hal.executable private @shared_mem_transpose { builtin.module { func.func @shared_mem_transpose() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 768], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x768xf32> %3 = tensor.empty() : tensor<768x2048xf32> %4 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<2048x768xf32>) outs(%3 : tensor<768x2048xf32>) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir index e1a1a69079709..bb741acd65e4d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/pack_pipeline_test.mlir @@ -1,16 +1,20 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 
--pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s -module { - func.func @static_pack() { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xi32> - %3 = tensor.empty() : tensor<4x16x16x32xi32> - %pack = tensor.pack %2 inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %3 : tensor<128x256xi32> -> tensor<4x16x16x32xi32> - flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [4, 16, 16, 32], strides = [1, 1, 1, 1] : tensor<4x16x16x32xi32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> +func.func @static_pack() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xi32> + %3 = tensor.empty() : tensor<4x16x16x32xi32> + %pack = tensor.pack %2 inner_dims_pos = [1, 0] inner_tiles = [16, 32] into %3 : tensor<128x256xi32> -> tensor<4x16x16x32xi32> + flow.dispatch.tensor.store %pack, %1, offsets = [0, 0, 0, 0], sizes = [4, 16, 16, 32], strides = [1, 1, 1, 1] : tensor<4x16x16x32xi32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @static_pack // CHECK-NOT: vector.transfer_write diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/promote_matmul_to_fit_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/promote_matmul_to_fit_mma.mlir index 7d8bbc5983bf2..45eb7adda2966 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/promote_matmul_to_fit_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/promote_matmul_to_fit_mma.mlir @@ -1,6 +1,13 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-llvmgpu-promote-matmul-to-fit-mma{target-dimensions=parallel}))" %s | FileCheck %s --check-prefixes=ALL,PARALLEL // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-llvmgpu-promote-matmul-to-fit-mma{target-dimensions=reduction}))" %s | FileCheck %s --check-prefixes=ALL,REDUCTION +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * 128)> #map2 = affine_map<()[s0] -> (s0 * -64 + 968, 64)> @@ -10,9 +17,9 @@ func.func @batch_matmul_f16() { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) 
offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_z = hal.interface.workgroup.id[2] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %3 = affine.apply #map()[%workgroup_id_y] @@ -29,9 +36,9 @@ func.func @batch_matmul_f16() { return } // ALL-LABEL: func.func @batch_matmul_f16 -// ALL: %[[LHS_HANDLE:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> -// ALL: %[[RHS_HANDLE:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> -// ALL: %[[OUT_HANDLE:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> +// ALL: %[[LHS_HANDLE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> +// ALL: %[[RHS_HANDLE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> +// ALL: %[[OUT_HANDLE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> // ALL-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_HANDLE]] // ALL-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_HANDLE]] // PARALLEL: %[[PADDED_LHS:.+]] = tensor.pad %[[LHS]] @@ -60,6 +67,13 @@ func.func @batch_matmul_f16() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * 128)> #map2 = affine_map<()[s0] -> (s0 * -64 + 968, 64)> @@ -74,9 +88,9 @@ func.func @batch_matmul_pad_reduction_after_tiling() { %c1 = arith.constant 1 : index %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_z = hal.interface.workgroup.id[2] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %3 = affine.apply #map()[%workgroup_id_y] @@ -114,9 +128,9 @@ func.func @batch_matmul_pad_reduction_after_tiling() { // The padding on parallel dims is a nop because they are already padded. Skip // the check for the testcase. 
// ALL-LABEL: func.func @batch_matmul_pad_reduction_after_tiling -// ALL: %[[LHS_HANDLE:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> -// ALL: %[[RHS_HANDLE:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> -// ALL: %[[OUT_HANDLE:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> +// ALL: %[[LHS_HANDLE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> +// ALL: %[[RHS_HANDLE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> +// ALL: %[[OUT_HANDLE:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> // ALL-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_HANDLE]] // ALL-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_HANDLE]] // REDUCTION: %[[INIT:.+]] = tensor.empty() : tensor<1x64x128xf16> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir index 28a10a9a221ea..860513c98d655 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_cuda.mlir @@ -18,8 +18,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %c0 = arith.constant 0 : index %c10240 = arith.constant 10240 : index %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 10240], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x10240xf32> %8 = tensor.empty() : tensor<512xf32> @@ -122,8 +122,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %c10240 = arith.constant 10240 : index %cst_0 = arith.constant 3.840000e+02 : f32 %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x10240xf32> %8 = tensor.empty() : tensor<512xf32> @@ -215,8 +215,8 @@ hal.executable.variant @cuda target(<"cuda", "cuda-nvptx-fb">) { %cst = arith.constant -3.40282347E+38 : f32 %cst_0 = arith.constant 0.000000e+00 : f32 %cst_1 = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan 
set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
       %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
       %3 = tensor.empty() : tensor<12x128x40960xf32>
       %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
index 7ee8bb8517cd9..b890571091921 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_rocm.mlir
@@ -1,20 +1,24 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx940 --pass-pipeline="builtin.module(func.func(iree-codegen-decompose-softmax), iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s --check-prefix=CDNA3
 
-module {
-  func.func @softmax() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant -3.40282347E+38 : f32
-    %cst_0 = arith.constant 0.000000e+00 : f32
-    %cst_1 = arith.constant 1.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
-    %3 = tensor.empty() : tensor<12x128x40960xf32>
-    %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @softmax() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -3.40282347E+38 : f32
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %cst_1 = arith.constant 1.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
+  %3 = tensor.empty() : tensor<12x128x40960xf32>
+  %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+  return
 }
 
 // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -24,20 +28,24 @@ module {
 // -----
 
-module {
-  func.func @softmax() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant -3.40282347E+38 : f32
-    %cst_0 = arith.constant 0.000000e+00 : f32
-    %cst_1 = arith.constant 1.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
-    %3 = tensor.empty() : tensor<12x128x40960xf32>
-    %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @softmax() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -3.40282347E+38 : f32
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %cst_1 = arith.constant 1.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<12x128x40960xf32>> -> tensor<12x128x40960xf32>
+  %3 = tensor.empty() : tensor<12x128x40960xf32>
+  %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor<writeonly:tensor<12x128x40960xf32>>
+  return
 }
 
 // On CDNA, we prefer wave64 with subgroup size 64.
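(Reviewer note) Every hunk in these test diffs applies the same mechanical rewrite, so a minimal before/after sketch of a single binding may be easier to eyeball than the full hunks; the tensor type, SSA names, and the `#pipeline_layout` reference below are illustrative stand-ins assumed for the sketch, not lines taken from any one test above.

// Before this change: the descriptor type is spelled on the op itself and
// push constants are loaded by bare index.
%old = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16xf32>>
%k_old = hal.interface.constant.load[0] : i32

// After this change: both ops reference an explicit #pipeline_layout attribute;
// type(storage_buffer) is dropped because the descriptor type now comes from the
// layout's #hal.descriptor_set.binding entry, and constant loads name an ordinal.
%new = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16xf32>>
%k_new = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32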
@@ -49,26 +57,30 @@ module {
 // -----
 
-module {
-  func.func @dynamic_softmax() {
-    %c32_i64 = arith.constant 32 : i64
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = arith.extui %0 : i32 to i64
-    %3 = arith.extui %1 : i32 to i64
-    %4 = arith.shli %3, %c32_i64 : i64
-    %5 = arith.ori %2, %4 : i64
-    %6 = arith.index_castui %5 : i64 to index
-    %7 = flow.dispatch.workload.ordinal %6, 0 : index
-    %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7}
-    %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
-    %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7} -> tensor<32x?xf16>
-    %11 = tensor.empty(%7) : tensor<32x?xf16>
-    %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16>
-    flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @dynamic_softmax() {
+  %c32_i64 = arith.constant 32 : i64
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = arith.extui %0 : i32 to i64
+  %3 = arith.extui %1 : i32 to i64
+  %4 = arith.shli %3, %c32_i64 : i64
+  %5 = arith.ori %2, %4 : i64
+  %6 = arith.index_castui %5 : i64 to index
+  %7 = flow.dispatch.workload.ordinal %6, 0 : index
+  %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7}
+  %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
+  %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?xf16>>{%7} -> tensor<32x?xf16>
+  %11 = tensor.empty(%7) : tensor<32x?xf16>
+  %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16>
+  flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %7], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x?xf16>>{%7}
+  return
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir
index cf83253dae30c..2e3f01c256831 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_cuda.mlir
@@ -1,8 +1,14 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, iree-codegen-lower-executable-using-transform-dialect, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s
 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable @small_reduction {
 hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
-  hal.executable.export public @small_reduction ordinal(0)
layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @small_reduction ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -11,8 +17,8 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { func.func @small_reduction() { %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 13], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x13xf32> %3 = tensor.empty() : tensor<1024xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1024xf32>) -> tensor<1024xf32> @@ -46,9 +52,15 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @group_reduction { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @group_reduction ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -57,8 +69,8 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { func.func @group_reduction() { %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> %3 = tensor.empty() : tensor<8xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> @@ -109,9 +121,15 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @group_elementwise_reduction_elementwise { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_elementwise_reduction_elementwise ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @group_elementwise_reduction_elementwise ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: 
!hal.device, %arg1: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1 hal.return %x, %y, %z : index, index, index @@ -120,8 +138,8 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { func.func @group_elementwise_reduction_elementwise() { %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 64], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x64xf32> %3 = tensor.empty() : tensor<8xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<8xf32>) -> tensor<8xf32> @@ -180,9 +198,15 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @group_reduction_larger { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_larger ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @group_reduction_larger ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -191,8 +215,8 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { func.func @group_reduction_larger() { %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [33, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<33x1024xf32> %3 = tensor.empty() : tensor<33xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<33xf32>) -> tensor<33xf32> @@ -244,9 +268,15 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @group_reduction_1d { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -255,8 +285,8 @@ 
hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { func.func @group_reduction_1d() { %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor> -> tensor<64xf32> %3 = tensor.empty() : tensor %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor) -> tensor @@ -277,9 +307,15 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @group_elementwise_reduction_elementwise_4d { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_elementwise_reduction_elementwise_4d ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @group_elementwise_reduction_elementwise_4d ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 hal.return %x, %y, %z : index, index, index @@ -288,8 +324,8 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { func.func @group_elementwise_reduction_elementwise_4d() { %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 4, 8, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<2x4x8x64xf32> %3 = tensor.empty() : tensor<2x4x8xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2x4x8xf32>) -> tensor<2x4x8xf32> @@ -319,9 +355,15 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @group_reduction_i8_12345 { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @group_reduction_i8_12345 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @group_reduction_i8_12345 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -330,8 +372,8 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", 
"cuda-nvptx-fb">) { func.func @group_reduction_i8_12345() { %c0 = arith.constant 0 : index %cst = arith.constant 0 : i8 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [8, 12345], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x12345xi8> %3 = tensor.empty() : tensor<8x12345xi8> %4 = tensor.empty() : tensor<8xi8> @@ -398,11 +440,15 @@ hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer>]>]> - hal.executable @reduction_2d_trailing_elementwise_static_dispatch_0 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @reduction_2d_trailing_elementwise_static_dispatch_0_generic_128x10_f32 ordinal(0) layout(#pipeline_layout) { @@ -414,8 +460,8 @@ hal.executable @reduction_2d_trailing_elementwise_static_dispatch_0 { func.func @reduction_2d_trailing_elementwise_static_dispatch_0_generic_128x10_f32() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 10], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x10xf32> %3 = tensor.empty() : tensor<128x10xf32> %4 = tensor.empty() : tensor<128xf32> @@ -463,9 +509,18 @@ hal.executable @reduction_2d_trailing_elementwise_static_dispatch_0 { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> hal.executable private @i4_dequant_matvec { hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer, ReadOnly>, <4, storage_buffer>]>]>) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index @@ -474,11 +529,11 @@ hal.executable private 
@i4_dequant_matvec { func.func @i4_dequant_matvec() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir index 3fad5724b2668..cfa16f79f2c02 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/reduction_pipeline_transform_rocm.mlir @@ -5,9 +5,15 @@ // RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" \ // RUN: %s | FileCheck %s --check-prefix=CDNA3 +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @group_reduction_1d { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @group_reduction_1d ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) { + hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index): %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2 hal.return %x, %y, %z : index, index, index @@ -16,8 +22,8 @@ hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { func.func @group_reduction_1d() { %c0 = arith.constant 0 : index %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : 
!flow.dispatch.tensor<readonly:tensor<64xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
         %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor<readonly:tensor<64xf32>> -> tensor<64xf32>
         %3 = tensor.empty() : tensor<f32>
         %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<f32>) -> tensor<f32>
@@ -40,9 +46,15 @@ hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable @group_reduction_1d {
 hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-  hal.executable.export public @group_reduction_1d ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer>, <1, storage_buffer>]>]>) {
+  hal.executable.export public @group_reduction_1d ordinal(0) layout(#pipeline_layout) {
   ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
     %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
     hal.return %x, %y, %z : index, index, index
@@ -51,8 +63,8 @@ hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
   func.func @group_reduction_1d() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant -0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
+    %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64xf32>>
+    %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<f32>>
     %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [64], strides = [1] : !flow.dispatch.tensor<readonly:tensor<64xf32>> -> tensor<64xf32>
     %3 = tensor.empty() : tensor<f32>
     %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<f32>) -> tensor<f32>
@@ -76,9 +88,18 @@ hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
 hal.executable private @i4_dequant_matvec {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer, ReadOnly>, <4, storage_buffer>]>]>) {
+    hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
@@ -87,11 +108,11 @@ hal.executable private @i4_dequant_matvec {
   func.func @i4_dequant_matvec() {
     %c0 = arith.constant 0 : index
     %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) 
alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> @@ -144,9 +165,18 @@ hal.executable private @i4_dequant_matvec { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> hal.executable private @i4_dequant_matvec { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer, ReadOnly>, <3, storage_buffer, ReadOnly>, <4, storage_buffer>]>]>) { + hal.executable.export public @i4_dequant_matvec ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index @@ -155,11 +185,11 @@ hal.executable private @i4_dequant_matvec { func.func @i4_dequant_matvec() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x32xf16> @@ -194,9 +224,16 @@ hal.executable private @i4_dequant_matvec { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> hal.executable private @matvec_fp16 { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @matvec_fp16 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { + hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: !hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index @@ -205,9 +242,9 @@ hal.executable private @matvec_fp16 { func.func @matvec_fp16() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> %5 = tensor.empty() : tensor<1x32000xf16> @@ -250,9 +287,16 @@ hal.executable private @matvec_fp16 { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> hal.executable private @matvec_fp16 { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { - hal.executable.export public @matvec_fp16 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) { + hal.executable.export public @matvec_fp16 ordinal(0) layout(#pipeline_layout) { ^bb0(%arg0: 
!hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index @@ -261,9 +305,9 @@ hal.executable private @matvec_fp16 { func.func @matvec_fp16() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32000, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32000x4096xf16> %5 = tensor.empty() : tensor<1x32000xf16> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir index ba392f945e271..6a060af8a1b6b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir @@ -22,9 +22,9 @@ hal.executable @simpleMath_ex_dispatch_0 { builtin.module { func.func @add_dispatch_0() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %3 = tensor.empty() : tensor<16xf32> %4 = flow.dispatch.tensor.load %0, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor> -> tensor<16xf32> %5 = flow.dispatch.tensor.load %1, offsets=[0], sizes=[16], strides=[1] : !flow.dispatch.tensor> -> tensor<16xf32> @@ -69,9 +69,9 @@ hal.executable @dot_dispatch_0 { %c0 = arith.constant 0 : index %c1024 = arith.constant 1024 : index %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) 
binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
        %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
          : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
        %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1]
@@ -123,9 +123,9 @@ hal.executable @ext_fp8_dispatch {
     builtin.module {
       func.func @ext_fp8_dispatch() {
         %c0 = arith.constant 0 : index
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf8E4M3FNUZ>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf8E5M2FNUZ>>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf8E4M3FNUZ>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096xf8E5M2FNUZ>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
         %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4096xf8E4M3FNUZ>> -> tensor<4096xf8E4M3FNUZ>
         %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4096xf8E5M2FNUZ>> -> tensor<4096xf8E5M2FNUZ>
         %5 = tensor.empty() : tensor<4096xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
index 274ec1dd55437..186fcf58b4bc3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
@@ -14,29 +14,34 @@
 // RUN: -td-matmul-strategy-use-fma=true \
 // RUN: | FileCheck %s --check-prefixes=CHECK,OPTIONS

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
-module {
-  func.func @batch_matmul_dispatch_0_generic_128x80x320x32_f32() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x80x32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<128x32x320xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x80x320xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x80x32xf32>> -> tensor<128x80x32xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<128x32x320xf32>> -> tensor<128x32x320xf32>
-    %5 = tensor.empty() : tensor<128x80x320xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x80x320xf32>) -> tensor<128x80x320xf32>
-    %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : 
tensor<128x80x32xf32>, tensor<128x32x320xf32>) outs(%6 : tensor<128x80x320xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.mulf %in, %in_0 : f32 - %9 = arith.addf %out, %8 : f32 - linalg.yield %9 : f32 - } -> tensor<128x80x320xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor> - return - } +func.func @batch_matmul_dispatch_0_generic_128x80x320x32_f32() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x320xf32> + %5 = tensor.empty() : tensor<128x80x320xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x80x320xf32>) -> tensor<128x80x320xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x80x32xf32>, tensor<128x32x320xf32>) outs(%6 : tensor<128x80x320xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %8 = arith.mulf %in, %in_0 : f32 + %9 = arith.addf %out, %8 : f32 + linalg.yield %9 : f32 + } -> tensor<128x80x320xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor> + return } // CHECK: transform.named_sequence diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir index cc7d47eaafb9f..1f7bf8a7ae7cd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir @@ -1,21 +1,26 @@ // RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ // RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy | FileCheck %s -module { - func.func @nchw_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 128, 258, 258], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x128x258x258xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [256, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<256x128x3x3xf32> - %5 = 
tensor.empty() : tensor<8x256x256x256xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x128x258x258xf32>, tensor<256x128x3x3xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @nchw_convolution() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 128, 258, 258], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x128x258x258xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [256, 128, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<256x128x3x3xf32> + %5 = tensor.empty() : tensor<8x256x256x256xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> + %7 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x128x258x258xf32>, tensor<256x128x3x3xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @nchw_convolution @@ -62,21 +67,26 @@ module { // ----- -module { - func.func @nhwc_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x128xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 128, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x128x256xf32> - %5 = tensor.empty() : tensor<8x256x256x256xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x128xf32>, tensor<3x3x128x256xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], 
strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @nhwc_convolution() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x128xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 128, 256], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x128x256xf32> + %5 = tensor.empty() : tensor<8x256x256x256xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> + %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x128xf32>, tensor<3x3x128x256xf32>) outs(%6 : tensor<8x256x256x256xf32>) -> tensor<8x256x256x256xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 256], strides = [1, 1, 1, 1] : tensor<8x256x256x256xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @nhwc_convolution @@ -97,21 +107,26 @@ module { // ----- -module { - func.func @unaligned_convolution() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 132], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x132xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 132, 264], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x132x264xf32> - %5 = tensor.empty() : tensor<8x256x256x264xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> - %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x132xf32>, tensor<3x3x132x264xf32>) outs(%6 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 264], strides = [1, 1, 1, 1] : tensor<8x256x256x264xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @unaligned_convolution() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) 
flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [8, 258, 258, 132], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x258x258x132xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 132, 264], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x132x264xf32> + %5 = tensor.empty() : tensor<8x256x256x264xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> + %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %4 : tensor<8x258x258x132xf32>, tensor<3x3x132x264xf32>) outs(%6 : tensor<8x256x256x264xf32>) -> tensor<8x256x256x264xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [8, 256, 256, 264], strides = [1, 1, 1, 1] : tensor<8x256x256x264xf32> -> !flow.dispatch.tensor> + return } // CHECK: #iree_codegen.translation_info diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir index 6b7a14cb7c96e..887da83c043eb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir @@ -43,21 +43,26 @@ // RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ // RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-small-matmul | FileCheck --check-prefix=SMALL %s -module { - func.func @matmul_1() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> - %5 = tensor.empty() : tensor<2052x2052xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_1() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : 
!flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> + %5 = tensor.empty() : tensor<2052x2052xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @matmul_1 @@ -200,21 +205,26 @@ module { // ----- -module { - func.func @matmul_2() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2051, 2555], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2051x2555xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2555, 2051], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2555x2050xf32> - %5 = tensor.empty() : tensor<2051x2050xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2051x2555xf32>, tensor<2555x2050xf32>) outs(%6 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2051, 2050], strides = [1, 1] : tensor<2051x2050xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_2() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2051, 2555], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2051x2555xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2555, 2051], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2555x2050xf32> + %5 = tensor.empty() : tensor<2051x2050xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2051x2050xf32>) -> tensor<2051x2050xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2051x2555xf32>, tensor<2555x2050xf32>) outs(%6 : tensor<2051x2050xf32>) 
-> tensor<2051x2050xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2051, 2050], strides = [1, 1] : tensor<2051x2050xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @matmul_2 @@ -245,21 +255,26 @@ module { // ----- -module { - func.func @matmul_3() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2556xf32> - %5 = tensor.empty() : tensor<2048x2556xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2556xf32>, tensor<2556x2556xf32>) outs(%6 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : tensor<2048x2556xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_3() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2556xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2556xf32> + %5 = tensor.empty() : tensor<2048x2556xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2048x2556xf32>, tensor<2556x2556xf32>) outs(%6 : tensor<2048x2556xf32>) -> tensor<2048x2556xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2556], strides = [1, 1] : tensor<2048x2556xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @matmul_3 @@ -272,21 +287,26 @@ module { // ----- -module { - func.func @matmul_4_partially_unaligned() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets 
= [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2044xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2044x1024xf32> - %5 = tensor.empty() : tensor<2048x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : tensor<2048x1024xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_4_partially_unaligned() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2044xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2044x1024xf32> + %5 = tensor.empty() : tensor<2048x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2048x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2048x1024xf32>) -> tensor<2048x1024xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : tensor<2048x1024xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @matmul_4_partially_unaligned @@ -335,21 +355,26 @@ module { // ----- -module { - func.func @aligned_matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> - %5 = tensor.empty() : tensor<2048x2048xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xf32>, tensor<2048x2048xf32>) outs(%6 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + 
#hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @aligned_matmul() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x2048xf32> + %5 = tensor.empty() : tensor<2048x2048xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2048x2048xf32>, tensor<2048x2048xf32>) outs(%6 : tensor<2048x2048xf32>) -> tensor<2048x2048xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 2048], strides = [1, 1] : tensor<2048x2048xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @aligned_matmul @@ -397,21 +422,26 @@ module { // ----- -module { - func.func @matmul_5_small() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 2044], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x2044xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2044, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2044x1024xf32> - %5 = tensor.empty() : tensor<2x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_5_small() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 2044], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x2044xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2044, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> 
tensor<2044x1024xf32> + %5 = tensor.empty() : tensor<2x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x1024xf32>) -> tensor<2x1024xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2x2044xf32>, tensor<2044x1024xf32>) outs(%6 : tensor<2x1024xf32>) -> tensor<2x1024xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor> + return } // CHECK: iree_codegen.translation_info @@ -431,21 +461,26 @@ module { // ----- -module { - func.func @f16_matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf16> - %5 = tensor.empty() : tensor<2052x2052xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf16>, tensor<2556x2052xf16>) outs(%6 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @f16_matmul() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf16> + %5 = tensor.empty() : tensor<2052x2052xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> + %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf16>, tensor<2556x2052xf16>) outs(%6 : tensor<2052x2052xf16>) -> tensor<2052x2052xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf16> -> !flow.dispatch.tensor> + return } // CHECK: iree_codegen.translation_info @@ -459,21 +494,26 @@ module { // ----- -module { - func.func @int8_matmul() { - %c0 = arith.constant 0 : index - %c0_i8 = arith.constant 0 : i8 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) 
type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x2556xi8> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xi8> - %5 = tensor.empty() : tensor<4x2052xi8> - %6 = linalg.fill ins(%c0_i8 : i8) outs(%5 : tensor<4x2052xi8>) -> tensor<4x2052xi8> - %7 = linalg.matmul ins(%3, %4 : tensor<4x2556xi8>, tensor<2556x2052xi8>) outs(%6 : tensor<4x2052xi8>) -> tensor<4x2052xi8> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 2052], strides = [1, 1] : tensor<4x2052xi8> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @int8_matmul() { + %c0 = arith.constant 0 : index + %c0_i8 = arith.constant 0 : i8 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x2556xi8> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xi8> + %5 = tensor.empty() : tensor<4x2052xi8> + %6 = linalg.fill ins(%c0_i8 : i8) outs(%5 : tensor<4x2052xi8>) -> tensor<4x2052xi8> + %7 = linalg.matmul ins(%3, %4 : tensor<4x2556xi8>, tensor<2556x2052xi8>) outs(%6 : tensor<4x2052xi8>) -> tensor<4x2052xi8> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4, 2052], strides = [1, 1] : tensor<4x2052xi8> -> !flow.dispatch.tensor> + return } // SMALL-LABEL: func @int8_matmul diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir index c94461ffcd9f4..7bc1965aa39c5 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_pad.mlir @@ -16,22 +16,26 @@ // RUN: --td-pad-strategy-use-async-copies=false \ // RUN: | FileCheck --check-prefix=WITH_OPTIONS %s -module { - func.func @pad() { - %c0 = arith.constant 0 : index - %c56 = arith.constant 56 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %cst_0 = arith.constant 0.000000e+00 : f32 - %padded = tensor.pad %2 low[%c0, 0] high[5, %c56] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst_0 : f32 - } : tensor<123x456xf32> to 
tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @pad() { + %c0 = arith.constant 0 : index + %c56 = arith.constant 56 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + %padded = tensor.pad %2 low[%c0, 0] high[5, %c56] { + ^bb0(%arg0: index, %arg1: index): + tensor.yield %cst_0 : f32 + } : tensor<123x456xf32> to tensor<128x512xf32> + flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @pad @@ -94,21 +98,25 @@ module { // ----- -module { - func.func @pad_low() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> - %cst_0 = arith.constant 0.000000e+00 : f32 - %padded = tensor.pad %2 low[5, 0] high[0, 56] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst_0 : f32 - } : tensor<123x456xf32> to tensor<128x512xf32> - flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @pad_low() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<123x456xf32> + %cst_0 = arith.constant 0.000000e+00 : f32 + %padded = tensor.pad %2 low[5, 0] high[0, 56] { + ^bb0(%arg0: index, %arg1: index): + tensor.yield %cst_0 : f32 + } : tensor<123x456xf32> to tensor<128x512xf32> + flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor> + return } // The strategy doesn't apply for low padding. 
@@ -119,22 +127,26 @@ module {
 // -----
-module {
-  func.func @pad_local() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
-    %padded = tensor.pad %2 low[0, 0] high[5, 56] {
-    ^bb0(%arg0: index, %arg1: index):
-      %3 = arith.index_cast %arg0 : index to i64
-      %4 = arith.uitofp %3 : i64 to f32
-      tensor.yield %4 : f32
-    } : tensor<123x456xf32> to tensor<128x512xf32>
-    flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @pad_local() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<123x456xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [123, 456], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<123x456xf32>> -> tensor<123x456xf32>
+  %padded = tensor.pad %2 low[0, 0] high[5, 56] {
+  ^bb0(%arg0: index, %arg1: index):
+    %3 = arith.index_cast %arg0 : index to i64
+    %4 = arith.uitofp %3 : i64 to f32
+    tensor.yield %4 : f32
+  } : tensor<123x456xf32> to tensor<128x512xf32>
+  flow.dispatch.tensor.store %padded, %1, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : tensor<128x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<128x512xf32>>
+  return
 }
 // The strategy doesn't apply for local pad values.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensor_pad.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensor_pad.mlir index d815f93343b25..01904e18a7c94 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensor_pad.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensor_pad.mlir @@ -1,11 +1,17 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-llvmgpu-tensor-pad),fold-memref-alias-ops,canonicalize,cse)" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @transpose_no_align_dispatch_0_generic_48x32() { %c48 = arith.constant 48 : index %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -34,8 +40,8 @@ func.func @transpose_no_align_dispatch_0_generic_48x32() { // CHECK: %[[C48:.*]] = arith.constant 48 : index // CHECK: %[[C32:.*]] = arith.constant 32 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[D0:.*]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : !flow.dispatch.tensor> -// CHECK: %[[D1:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) : !flow.dispatch.tensor> +// CHECK: %[[D0:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : !flow.dispatch.tensor> +// CHECK: %[[D1:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[C0]]) : !flow.dispatch.tensor> // CHECK: %[[WORKGROUP_ID_X:.*]] = hal.interface.workgroup.id[0] : index // CHECK: %[[WORKGROUP_COUNT_X:.*]] = hal.interface.workgroup.count[0] : index // CHECK: %[[WORKGROUP_ID_Y:.*]] = hal.interface.workgroup.id[1] : index @@ -67,6 +73,12 @@ func.func @transpose_no_align_dispatch_0_generic_48x32() { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #map = affine_map<()[s0] -> (s0 * 16)> #map1 = affine_map<(d0)[s0] -> (-d0 + s0, 16)> #map2 = affine_map<(d0) -> (d0 ceildiv 2)> @@ -74,16 +86,16 @@ func.func @transpose_no_align_dispatch_0_generic_48x32() { func.func @unpack_dynamic() { %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 %4 = arith.index_castui %0 : i32 to index %5 = arith.index_castui %1 : i32 to index %6 = 
arith.index_castui %2 : i32 to index %7 = arith.index_castui %3 : i32 to index - %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor>{%4, %5} - %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7} + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c64) flags(ReadOnly) : !flow.dispatch.tensor>{%4, %5} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6, %7} %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -113,7 +125,7 @@ func.func @unpack_dynamic() { return } // CHECK-LABEL: func.func @unpack_dynamic -// CHECK: %[[DEST_BUF:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[DEST_BUF:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[DEST_BUF]] // CHECK: %[[PAD:.+]] = tensor.pad %[[LOAD]] // CHECK: %[[UNPACK:.+]] = tensor.unpack {{.+}} into %[[PAD]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir index 18b5b88a38bb9..edc882be49de2 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/tensorcore_vectorization.mlir @@ -1,13 +1,20 @@ // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmgpu-tensorcore-vectorization))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> func.func @dot() { %c16 = arith.constant 16 : index %c1024 = arith.constant 1024 : index %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048x1024xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1024x512xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2048x512xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2048x1024xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<1024x512xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<2048x512xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_y] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir index 82f73e62ceee0..25e19a8efdaf1 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_bufferize.mlir @@ -1,10 +1,17 @@ // RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + 
]>
+]>
 func.func @pad_matmul_static_dispatch_0() {
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<250x1020xf32>>
   %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
   %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
index 954791f760cfe..024c901d0b3a1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_promote_operands.mlir
@@ -1,11 +1,17 @@
 // RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 hal.executable private @pad_matmul_static_dispatch_0 {
   builtin.module {
     func.func @pad_matmul_static_dispatch_0(%arg0: tensor<250x500xf32>, %arg1: tensor<500x1020xf32>) -> tensor<250x1020xf32> {
       %c0 = arith.constant 0 : index
-      %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
-      %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
+      %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<250x500xf32>>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>>
       %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [250, 500], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<250x500xf32>> -> tensor<250x500xf32>
       %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [500, 1020], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<500x1020xf32>> -> tensor<500x1020xf32>
@@ -13,8 +19,8 @@ hal.executable private @pad_matmul_static_dispatch_0 {
       %cst = arith.constant 0.000000e+00 : f32
       %5 = linalg.fill ins(%cst : f32) outs(%50 : tensor<250x1020xf32>) -> tensor<250x1020xf32>
       // CHECK: %[[CST:.+]] = arith.constant 0.000000e+00 : f32
-      // CHECK: %[[D0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64)
-      // CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64)
+      // CHECK: %[[D0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64)
+      // CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64)
       // CHECK: %[[D2:.+]] = flow.dispatch.tensor.load %[[D0]], offsets = [0, 0],
sizes = [250, 500] // CHECK: %[[D3:.+]] = flow.dispatch.tensor.load %[[D1]], offsets = [0, 0], sizes = [500, 1020] // CHECK: %[[D4:.+]] = tensor.empty() : tensor<250x1020xf32> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir index 0b2de2966ab44..91536b7f5594a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_dialect_vector_distribution.mlir @@ -8,46 +8,49 @@ // RUN: --allow-unregistered-dialect | \ // RUN: FileCheck %s +#pipeline_layout = #hal.pipeline.layout + ]> +]> #translation_info = #iree_codegen.translation_info -module { - func.func @reduce_dispatch_0() attributes {translation_info = #translation_info} { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128xf32> - memref.assume_alignment %0, 64 : memref<128xf32> - %1 = gpu.thread_id x - %2 = arith.cmpi ult, %1, %c1 : index +func.func @reduce_dispatch_0() attributes {translation_info = #translation_info} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128xf32> + memref.assume_alignment %0, 64 : memref<128xf32> + %1 = gpu.thread_id x + %2 = arith.cmpi ult, %1, %c1 : index - // WARP-EXECUTE-DAG: %[[C0:.*]] = arith.constant 0 : index - // WARP-EXECUTE-DAG: %[[C32:.*]] = arith.constant 32 : index - // WARP-EXECUTE: %[[TIDX:.*]] = gpu.thread_id x - // WARP-EXECUTE: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index - // Single-warp guard filters out threads 32-63. - // WARP-EXECUTE: scf.if %[[COND32]] { - // WARP-EXECUTE: vector.warp_execute_on_lane_0(%[[TIDX]])[32] { - // WARP-EXECUTE: %[[V:.*]] = "some_def"() : () -> vector<128xf32> - // WARP-EXECUTE: vector.transfer_write %[[V]], %{{.*}} {in_bounds = [true]} : vector<128xf32>, memref<128xf32> + // WARP-EXECUTE-DAG: %[[C0:.*]] = arith.constant 0 : index + // WARP-EXECUTE-DAG: %[[C32:.*]] = arith.constant 32 : index + // WARP-EXECUTE: %[[TIDX:.*]] = gpu.thread_id x + // WARP-EXECUTE: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index + // Single-warp guard filters out threads 32-63. + // WARP-EXECUTE: scf.if %[[COND32]] { + // WARP-EXECUTE: vector.warp_execute_on_lane_0(%[[TIDX]])[32] { + // WARP-EXECUTE: %[[V:.*]] = "some_def"() : () -> vector<128xf32> + // WARP-EXECUTE: vector.transfer_write %[[V]], %{{.*}} {in_bounds = [true]} : vector<128xf32>, memref<128xf32> - // CHECK-DAG: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 4)> - // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index - // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index - // CHECK: %[[TIDX:.*]] = gpu.thread_id x - // CHECK: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index - // Single-warp guard filters out threads 32-63. - // CHECK: scf.if %[[COND32]] { - // CHECK: %[[COND1:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index - // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<128xf32, #gpu.address_space> - // Single-thread guard runs on thread 0 only. 
- // CHECK: scf.if %[[COND1]] { - // CHECK: %[[V:.*]] = "some_def"() : () -> vector<128xf32> - // CHECK: vector.transfer_write %[[V]], %{{.*}} : vector<128xf32>, memref<128xf32, #gpu.address_space> - // CHECK: %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[TIDX]]] - // CHECK: %[[LOADED:.*]] = vector.transfer_read %{{.*}}[%[[IDX]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, #gpu.address_space>, vector<4xf32> - // CHECK: vector.transfer_write %[[LOADED]], %{{.*}} {in_bounds = [true]} : vector<4xf32>, memref<128xf32> - scf.if %2 { - %v = "some_def"() : () -> (vector<128xf32>) - vector.transfer_write %v, %0[%c0] : vector<128xf32>, memref<128xf32> - } - return + // CHECK-DAG: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 4)> + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index + // CHECK: %[[TIDX:.*]] = gpu.thread_id x + // CHECK: %[[COND32:.*]] = arith.cmpi ult, %[[TIDX]], %[[C32]] : index + // Single-warp guard filters out threads 32-63. + // CHECK: scf.if %[[COND32]] { + // CHECK: %[[COND1:.*]] = arith.cmpi eq, %[[TIDX]], %[[C0]] : index + // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<128xf32, #gpu.address_space> + // Single-thread guard runs on thread 0 only. + // CHECK: scf.if %[[COND1]] { + // CHECK: %[[V:.*]] = "some_def"() : () -> vector<128xf32> + // CHECK: vector.transfer_write %[[V]], %{{.*}} : vector<128xf32>, memref<128xf32, #gpu.address_space> + // CHECK: %[[IDX:.*]] = affine.apply #[[MAP]]()[%[[TIDX]]] + // CHECK: %[[LOADED:.*]] = vector.transfer_read %{{.*}}[%[[IDX]]], %{{.*}} {in_bounds = [true]} : memref<128xf32, #gpu.address_space>, vector<4xf32> + // CHECK: vector.transfer_write %[[LOADED]], %{{.*}} {in_bounds = [true]} : vector<4xf32>, memref<128xf32> + scf.if %2 { + %v = "some_def"() : () -> (vector<128xf32>) + vector.transfer_write %v, %0[%c0] : vector<128xf32>, memref<128xf32> } + return } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir index c040e07b8f311..410266427633d 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_distribute_forall.mlir @@ -1,5 +1,10 @@ // RUN: iree-opt %s --pass-pipeline="builtin.module(iree-codegen-lower-executable-using-transform-dialect)" | FileCheck %s +#pipeline_layout = #hal.pipeline.layout + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> #translation = #iree_codegen.translation_info module { @@ -8,7 +13,7 @@ module { %c250 = arith.constant 250 : index %c8 = arith.constant 8 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2xf16> memref.assume_alignment %0, 64 : memref<2xf16> %workgroup_id_x = hal.interface.workgroup.id[0] : index %subview = memref.subview %0[%workgroup_id_x] [1] [1] : memref<2xf16> to memref<1xf16, strided<[1], offset: ?>> diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir index f1d2b4c81a5af..57f8c8293506c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir +++ 
b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_gpu_pipelining.mlir @@ -1,5 +1,12 @@ // RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> hal.executable private @matmul_pipelining { builtin.module { func.func @matmul_pipelining() { @@ -14,11 +21,11 @@ func.func @matmul_pipelining() { %3 = gpu.thread_id z %4 = memref.alloc() : memref<4x32x40xf16, 3> %5 = memref.alloc() : memref<4x32x40xf16, 3> - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<3456x2048xf16> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<3456x2048xf16> memref.assume_alignment %6, 64 : memref<3456x2048xf16> - %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<2048x1024xf16> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<2048x1024xf16> memref.assume_alignment %7, 64 : memref<2048x1024xf16> - %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<3456x1024xf16> + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<3456x1024xf16> memref.assume_alignment %8, 64 : memref<3456x1024xf16> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir index 82334e9327b90..2159e58ee50fb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transform_vector_to_mma.mlir @@ -1,5 +1,12 @@ // RUN: iree-opt %s --split-input-file -iree-transform-dialect-interpreter -transform-dialect-drop-schedule | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> hal.executable private @matmul { builtin.module { // CHECK-LABEL: func.func @matmul @@ -10,11 +17,11 @@ func.func @matmul() { %c16 = arith.constant 16 : index %c32 = arith.constant 32 : index %cst_0 = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<32x32xf32> memref.assume_alignment %0, 64 : memref<32x32xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x32xf32> memref.assume_alignment %1, 64 : memref<32x32xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x32xf32> memref.assume_alignment %2, 64 : memref<32x32xf32> %3 = gpu.thread_id x %4 = gpu.thread_id y @@ -70,6 +77,13 @@ module attributes { 
transform.with_named_sequence } { // ----- // Verify that unrolling does not apply to rank 1 elementwise vector ops. +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> hal.executable private @gathered_matmul { builtin.module { // CHECK-LABEL: func.func @gathered_matmul @@ -84,11 +98,11 @@ func.func @gathered_matmul() { %cst_0 = arith.constant 0.000000e+00 : f32 %cst_1 = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex> %cst_2 = arith.constant dense<1> : vector<4x4xindex> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<32x32xf32> memref.assume_alignment %0, 64 : memref<32x32xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x32xf32> memref.assume_alignment %1, 64 : memref<32x32xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x32xf32> memref.assume_alignment %2, 64 : memref<32x32xf32> %alloc = memref.alloc() {alignment = 64 : i64} : memref<32x32xf32> %3 = gpu.thread_id x diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir index c204d88f8e54b..09357d92bae10 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/transpose_pipeline_test.mlir @@ -1,8 +1,13 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 \ // RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target, fold-memref-alias-ops, canonicalize, cse)))))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer>]>]> hal.executable @transpose_dispatch_0 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @transpose_dispatch_0_generic_4096x4096 ordinal(0) layout(#pipeline_layout) { @@ -13,8 +18,8 @@ hal.executable @transpose_dispatch_0 { builtin.module { func.func @transpose_dispatch_0_generic_4096x4096() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> 
tensor<4096x4096xf32> %3 = tensor.empty() : tensor<4096x4096xf32> %4 = linalg.generic {indexing_maps = [ affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<4096x4096xf32>) outs(%3 : tensor<4096x4096xf32>) { @@ -35,9 +40,9 @@ hal.executable @transpose_dispatch_0 { // CHECK-DAG: %[[D1:.*]] = gpu.thread_id y // CHECK-DAG: %[[D2:.*]] = gpu.thread_id z // CHECK-DAG: %[[D3:.*]] = memref.alloc() : memref<32x33xf32, #gpu.address_space> -// CHECK: %[[D4:.*]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, #hal.descriptor_type> +// CHECK: %[[D4:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D4]], 64 : memref<4096x4096xf32, #hal.descriptor_type> -// CHECK: %[[D5:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, #hal.descriptor_type> +// CHECK: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[C0]]) : memref<4096x4096xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D5]], 64 : memref<4096x4096xf32, #hal.descriptor_type> // CHECK: gpu.barrier // CHECK: %[[D6:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] @@ -56,9 +61,14 @@ hal.executable @transpose_dispatch_0 { // ----- - +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer>]>]> hal.executable @transpose_single_operand_dispatch_0_generic_768x2048 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @transpose_single_operand_dispatch_0_generic_768x2048 ordinal(0) layout(#pipeline_layout) { @@ -69,9 +79,9 @@ hal.executable @transpose_single_operand_dispatch_0_generic_768x2048 { builtin.module { func.func @transpose_single_operand_dispatch_0_generic_768x2048() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 768], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2048x768xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [768, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<768x2048xf32> %5 = tensor.empty() : tensor<768x2048xf32> @@ -94,11 +104,11 @@ hal.executable @transpose_single_operand_dispatch_0_generic_768x2048 { // CHECK: %[[D1:.*]] = gpu.thread_id y // CHECK: 
%[[D2:.*]] = gpu.thread_id z // CHECK: %[[D3:.*]] = memref.alloc() : memref<32x33xf32, #gpu.address_space> -// CHECK: %[[D4:.*]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<2048x768xf32, #hal.descriptor_type> +// CHECK: %[[D4:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : memref<2048x768xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D4]], 64 : memref<2048x768xf32, #hal.descriptor_type> -// CHECK: %[[D5:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D5]], 64 : memref<768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D6:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%[[C0]]) : memref<768x2048xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D6]], 64 : memref<768x2048xf32, #hal.descriptor_type> // CHECK: gpu.barrier // CHECK: %[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] @@ -119,8 +129,14 @@ hal.executable @transpose_single_operand_dispatch_0_generic_768x2048 { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]> hal.executable @transpose_3d_no_dispatch_0_generic_768x2048x1024 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @transpose_3d_no_dispatch_0_generic_768x2048x1024 ordinal(0) layout(#pipeline_layout) { @@ -131,9 +147,9 @@ hal.executable @transpose_3d_no_dispatch_0_generic_768x2048x1024 { builtin.module { func.func @transpose_3d_no_dispatch_0_generic_768x2048x1024() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2048, 768, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2048x768x1024xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [768, 2048, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<768x2048x1024xf32> %5 = tensor.empty() : tensor<768x2048x1024xf32> @@ -156,8 +172,14 @@ hal.executable 
@transpose_3d_no_dispatch_0_generic_768x2048x1024 { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]> hal.executable @transpose_3d_yes_dispatch_0_generic_10x768x2048 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @transpose_3d_yes_dispatch_0_generic_10x768x2048 ordinal(0) layout(#pipeline_layout) { @@ -168,9 +190,9 @@ hal.executable @transpose_3d_yes_dispatch_0_generic_10x768x2048 { builtin.module { func.func @transpose_3d_yes_dispatch_0_generic_10x768x2048() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 2048, 768], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x2048x768xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 768, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x768x2048xf32> %5 = tensor.empty() : tensor<10x768x2048xf32> @@ -193,11 +215,11 @@ hal.executable @transpose_3d_yes_dispatch_0_generic_10x768x2048 { // CHECK: %[[D1:.*]] = gpu.thread_id y // CHECK: %[[D2:.*]] = gpu.thread_id z // CHECK: %[[D3:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> -// CHECK: %[[D4:.*]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> +// CHECK: %[[D4:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D4]], 64 : memref<10x2048x768xf32, #hal.descriptor_type> -// CHECK: %[[D5:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D5]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D6:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D6]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> // CHECK: gpu.barrier // CHECK: 
%[[D7:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] @@ -218,8 +240,14 @@ hal.executable @transpose_3d_yes_dispatch_0_generic_10x768x2048 { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]> hal.executable @transpose_3d_trans_out_dispatch_0_generic_10x2048x768 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @transpose_3d_trans_out_dispatch_0_generic_10x2048x768 ordinal(0) layout(#pipeline_layout) { @@ -230,9 +258,9 @@ hal.executable @transpose_3d_trans_out_dispatch_0_generic_10x2048x768 { builtin.module { func.func @transpose_3d_trans_out_dispatch_0_generic_10x2048x768() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 768, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x768x2048xf32> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [10, 768, 2048], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x768x2048xf32> %5 = tensor.empty() : tensor<10x2048x768xf32> @@ -256,11 +284,11 @@ hal.executable @transpose_3d_trans_out_dispatch_0_generic_10x2048x768 { // CHECK: %[[D2:.*]] = gpu.thread_id z // CHECK: %[[D3:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> // CHECK: %[[D4:.*]] = memref.alloc() : memref<1x32x33xf32, #gpu.address_space> -// CHECK: %[[D5:.*]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D5:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D5]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D6:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> +// CHECK: %[[D6:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[C0]]) : memref<10x768x2048xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D6]], 64 : memref<10x768x2048xf32, #hal.descriptor_type> -// CHECK: %[[D7:.*]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> +// CHECK: %[[D7:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) alignment(64) 
offset(%[[C0]]) : memref<10x2048x768xf32, #hal.descriptor_type> // CHECK: memref.assume_alignment %[[D7]], 64 : memref<10x2048x768xf32, #hal.descriptor_type> // CHECK: gpu.barrier // CHECK: %[[D8:.*]] = affine.apply #{{.*}}()[%{{.*}}, %[[D0]], %[[D1]], %[[D2]]] @@ -283,8 +311,14 @@ hal.executable @transpose_3d_trans_out_dispatch_0_generic_10x2048x768 { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb"> -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]> hal.executable @transpose_3d_diff_dispatch_0_generic_10x768x2048 { hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) { hal.executable.export public @transpose_3d_diff_dispatch_0_generic_10x768x2048 ordinal(0) layout(#pipeline_layout) { @@ -299,9 +333,9 @@ hal.executable @transpose_3d_diff_dispatch_0_generic_10x768x2048 { %c768 = arith.constant 768 : index %c2048 = arith.constant 2048 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir index eede18f97d31a..f231c8ba3f727 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ukernel_pipeline_transform.mlir @@ -1,41 +1,45 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> -module { - func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0xFC00 : f16 - %c0_i64 = arith.constant 0 : i64 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.extui %0 : i32 to i64 - %3 = arith.extui %1 : i32 to i64 - %4 = arith.shli %3, %c32_i64 : i64 - %5 = arith.ori %2, %4 : i64 - %6 = arith.index_castui %5 : i64 to index - %7 = hal.interface.binding.subspan set(0) 
binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %8 = flow.dispatch.workload.ordinal %6, 0 : index - %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%8} - %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor>{%8} -> tensor - %11 = tensor.empty() : tensor - %12 = tensor.empty() : tensor - %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor) -> tensor - %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor) -> tensor - %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor) outs(%14, %13 : tensor, tensor) { - ^bb0(%in: f16, %out: f16, %out_0: i64): - %16 = linalg.index 0 : index - %17 = arith.index_cast %16 : index to i64 - %18 = arith.maximumf %in, %out : f16 - %19 = arith.cmpf ogt, %in, %out : f16 - %20 = arith.select %19, %17, %out_0 : i64 - linalg.yield %18, %20 : f16, i64 - } -> (tensor, tensor) - flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> - return - } +func.func @argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0xFC00 : f16 + %c0_i64 = arith.constant 0 : i64 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.extui %0 : i32 to i64 + %3 = arith.extui %1 : i32 to i64 + %4 = arith.shli %3, %c32_i64 : i64 + %5 = arith.ori %2, %4 : i64 + %6 = arith.index_castui %5 : i64 to index + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %8 = flow.dispatch.workload.ordinal %6, 0 : index + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%8} + %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor>{%8} -> tensor + %11 = tensor.empty() : tensor + %12 = tensor.empty() : tensor + %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor) -> tensor + %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor) -> tensor + %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor) outs(%14, %13 : tensor, tensor) { + ^bb0(%in: f16, %out: f16, %out_0: i64): + %16 = linalg.index 0 : index + %17 = arith.index_cast %16 : index to i64 + %18 = arith.maximumf %in, %out : f16 + %19 = arith.cmpf ogt, %in, %out : f16 + %20 = arith.select %19, %17, %out_0 : i64 + linalg.yield %18, %20 : f16, i64 + } -> (tensor, tensor) + flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor -> !flow.dispatch.tensor> + return } // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info @@ -44,42 +48,47 @@ module { // CHECK: iree_codegen.ukernel.generic "__iree_uk_rocm_argmax_F16I64" // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { - 
%c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0xFF800000 : f32 - %c0_i64 = arith.constant 0 : i64 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.extui %0 : i32 to i64 - %3 = arith.extui %1 : i32 to i64 - %4 = arith.shli %3, %c32_i64 : i64 - %5 = arith.ori %2, %4 : i64 - %6 = arith.index_castui %5 : i64 to index - %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %8 = flow.dispatch.workload.ordinal %6, 0 : index - %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%8} - %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [16, %8], strides = [1, 1] : !flow.dispatch.tensor>{%8} -> tensor<16x?xf32> - %11 = tensor.empty() : tensor<16xi64> - %12 = tensor.empty() : tensor<16xf32> - %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<16xi64>) -> tensor<16xi64> - %14 = linalg.fill ins(%cst : f32) outs(%12 : tensor<16xf32>) -> tensor<16xf32> - %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<16x?xf32>) outs(%14, %13 : tensor<16xf32>, tensor<16xi64>) { - ^bb0(%in: f32, %out: f32, %out_0: i64): - %16 = linalg.index 1 : index - %17 = arith.index_cast %16 : index to i64 - %18 = arith.maximumf %in, %out : f32 - %19 = arith.cmpf ogt, %in, %out : f32 - %20 = arith.select %19, %17, %out_0 : i64 - linalg.yield %18, %20 : f32, i64 - } -> (tensor<16xf32>, tensor<16xi64>) - flow.dispatch.tensor.store %15#1, %7, offsets = [0], sizes = [16], strides = [1] : tensor<16xi64> -> !flow.dispatch.tensor> - return - } +func.func @argmax_2d_f32i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0xFF800000 : f32 + %c0_i64 = arith.constant 0 : i64 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.extui %0 : i32 to i64 + %3 = arith.extui %1 : i32 to i64 + %4 = arith.shli %3, %c32_i64 : i64 + %5 = arith.ori %2, %4 : i64 + %6 = arith.index_castui %5 : i64 to index + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %8 = flow.dispatch.workload.ordinal %6, 0 : index + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%8} + %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [16, %8], strides = [1, 1] : !flow.dispatch.tensor>{%8} -> tensor<16x?xf32> + %11 = tensor.empty() : tensor<16xi64> + %12 = tensor.empty() : tensor<16xf32> + %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<16xi64>) -> tensor<16xi64> + %14 = linalg.fill ins(%cst : f32) outs(%12 : tensor<16xf32>) -> tensor<16xf32> + %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<16x?xf32>) outs(%14, %13 : tensor<16xf32>, tensor<16xi64>) { + ^bb0(%in: f32, %out: f32, %out_0: i64): + %16 = linalg.index 1 : index + %17 = arith.index_cast %16 : index to i64 + %18 = arith.maximumf %in, %out : f32 + %19 = arith.cmpf ogt, %in, %out : f32 + %20 = arith.select %19, %17, %out_0 : i64 + linalg.yield %18, %20 : f32, i64 + } -> (tensor<16xf32>, 
tensor<16xi64>)
+  flow.dispatch.tensor.store %15#1, %7, offsets = [0], sizes = [16], strides = [1] : tensor<16xi64> -> !flow.dispatch.tensor<writeonly:tensor<16xi64>>
+  return
 }
 // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -91,42 +100,46 @@ module {
 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb">
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-module {
-  func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
-    %c32_i64 = arith.constant 32 : i64
-    %cst = arith.constant 0xFC00 : f16
-    %c0_i64 = arith.constant 0 : i64
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = arith.extui %0 : i32 to i64
-    %3 = arith.extui %1 : i32 to i64
-    %4 = arith.shli %3, %c32_i64 : i64
-    %5 = arith.ori %2, %4 : i64
-    %6 = arith.index_castui %5 : i64 to index
-    %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
-    %8 = flow.dispatch.workload.ordinal %6, 0 : index
-    %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
-    %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
-    %11 = tensor.empty() : tensor<i64>
-    %12 = tensor.empty() : tensor<f16>
-    %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
-    %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
-    %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
-    ^bb0(%in: f16, %out: f16, %out_0: i64):
-      %16 = linalg.index 0 : index
-      %17 = arith.index_cast %16 : index to i64
-      %18 = arith.maximumf %in, %out : f16
-      %19 = arith.cmpf ogt, %in, %out : f16
-      %20 = arith.select %19, %17, %out_0 : i64
-      linalg.yield %18, %20 : f16, i64
-    } -> (tensor<f16>, tensor<i64>)
-    flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
-    return
-  }
+func.func @no_ukernel_argmax_1d_f16i64() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0xFC00 : f16
+  %c0_i64 = arith.constant 0 : i64
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = arith.extui %0 : i32 to i64
+  %3 = arith.extui %1 : i32 to i64
+  %4 = arith.shli %3, %c32_i64 : i64
+  %5 = arith.ori %2, %4 : i64
+  %6 = arith.index_castui %5 : i64 to index
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+  %8 = flow.dispatch.workload.ordinal %6, 0 : index
+  %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
+  %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
+  %11 = tensor.empty() : tensor<i64>
+  %12 = tensor.empty() : tensor<f16>
+  %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
+  %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
+  %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
+  ^bb0(%in: f16, %out: f16, %out_0: i64):
+    %16 = linalg.index 0 : index
+    %17 = arith.index_cast %16 : index to i64
+    %18 = arith.maximumf %in, %out : f16
+    %19 = arith.cmpf ogt, %in, %out : f16
+    %20 = arith.select %19, %17, %out_0 : i64
+    linalg.yield %18, %20 : f16, i64
+  } -> (tensor<f16>, tensor<i64>)
+  flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+  return
 }
 // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -136,42 +149,46 @@ module {
 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "argmax"}>
 #map = affine_map<(d0) -> (d0)>
 #map1 = affine_map<(d0) -> ()>
-module {
-  func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
-    %c32_i64 = arith.constant 32 : i64
-    %cst = arith.constant 0.000000e+00 : f16
-    %c0_i64 = arith.constant 0 : i64
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = arith.extui %0 : i32 to i64
-    %3 = arith.extui %1 : i32 to i64
-    %4 = arith.shli %3, %c32_i64 : i64
-    %5 = arith.ori %2, %4 : i64
-    %6 = arith.index_castui %5 : i64 to index
-    %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
-    %8 = flow.dispatch.workload.ordinal %6, 0 : index
-    %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
-    %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
-    %11 = tensor.empty() : tensor<i64>
-    %12 = tensor.empty() : tensor<f16>
-    %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
-    %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
-    %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
-    ^bb0(%in: f16, %out: f16, %out_0: i64):
-      %16 = linalg.index 0 : index
-      %17 = arith.index_cast %16 : index to i64
-      %18 = arith.maximumf %in, %out : f16
-      %19 = arith.cmpf ogt, %in, %out : f16
-      %20 = arith.select %19, %17, %out_0 : i64
-      linalg.yield %18, %20 : f16, i64
-    } -> (tensor<f16>, tensor<i64>)
-    flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
-    return
-  }
+func.func @not_neg_inf_init_argmax_1d() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0.000000e+00 : f16
+  %c0_i64 = arith.constant 0 : i64
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = arith.extui %0 : i32 to i64
+  %3 = arith.extui %1 : i32 to i64
+  %4 = arith.shli %3, %c32_i64 : i64
+  %5 = arith.ori %2, %4 : i64
+  %6 = arith.index_castui %5 : i64 to index
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<i64>>
+  %8 = flow.dispatch.workload.ordinal %6, 0 : index
+  %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8}
+  %10 = flow.dispatch.tensor.load %9, offsets = [0], sizes = [%8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<?xf16>>{%8} -> tensor<?xf16>
+  %11 = tensor.empty() : tensor<i64>
+  %12 = tensor.empty() : tensor<f16>
+  %13 = linalg.fill ins(%c0_i64 : i64) outs(%11 : tensor<i64>) -> tensor<i64>
+  %14 = linalg.fill ins(%cst : f16) outs(%12 : tensor<f16>) -> tensor<f16>
+  %15:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%10 : tensor<?xf16>) outs(%14, %13 : tensor<f16>, tensor<i64>) {
+  ^bb0(%in: f16, %out: f16, %out_0: i64):
+    %16 = linalg.index 0 : index
+    %17 = arith.index_cast %16 : index to i64
+    %18 = arith.maximumf %in, %out : f16
+    %19 = arith.cmpf ogt, %in, %out : f16
+    %20 = arith.select %19, %17, %out_0 : i64
+    linalg.yield %18, %20 : f16, i64
+  } -> (tensor<f16>, tensor<i64>)
+  flow.dispatch.tensor.store %15#1, %7, offsets = [], sizes = [], strides = [] : tensor<i64> -> !flow.dispatch.tensor<writeonly:tensor<i64>>
+  return
 }
 // CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir
index f4241b0c4716e..fcb1192034215 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/winograd_pipeline_test.mlir
@@ -1,16 +1,20 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target))" %s | FileCheck %s
-module {
-  func.func @winograd_filter_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 3, 64, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>> -> tensor<3x3x64x128xf32>
-    %3 = tensor.empty() : tensor<8x8x64x128xf32>
-    %4 = iree_linalg_ext.winograd.filter_transform output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1]) ins(%2 : tensor<3x3x64x128xf32>) outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 8, 64, 128], strides = [1, 1, 1, 1] : tensor<8x8x64x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_filter_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [3, 3, 64, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x64x128xf32>> -> tensor<3x3x64x128xf32>
+  %3 = tensor.empty() : tensor<8x8x64x128xf32>
+  %4 = iree_linalg_ext.winograd.filter_transform output_tile_size(6) kernel_size(3) kernel_dimensions([0, 1]) ins(%2 : tensor<3x3x64x128xf32>) outs(%3 : tensor<8x8x64x128xf32>) -> tensor<8x8x64x128xf32>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [8, 8, 64, 128], strides = [1, 1, 1, 1] : tensor<8x8x64x128xf32> -> !flow.dispatch.tensor<writeonly:tensor<8x8x64x128xf32>>
+  return
 }
 // CHECK-LABEL: func.func @winograd_filter_transform
 // CHECK-NOT: memref.alloc
@@ -25,17 +29,21 @@ module {
 // -----

-module {
-  func.func @winograd_input_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>> -> tensor<2x34x34x128xf16>
-    %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16>
-    %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_input_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>> -> tensor<2x34x34x128xf16>
+  %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16>
+  %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
+  return
 }
 // CHECK-LABEL: func.func @winograd_input_transform
 // CHECK-NOT: memref.alloc
@@ -51,17 +59,21 @@ module {
 // -----

-module {
-  func.func @winograd_output_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>> -> tensor<8x8x2x6x6x128xf16>
-    %3 = tensor.empty() : tensor<2x36x36x128xf16>
-    %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128], strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_output_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>> -> tensor<8x8x2x6x6x128xf16>
+  %3 = tensor.empty() : tensor<2x36x36x128xf16>
+  %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128], strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
+  return
 }
 // CHECK-LABEL: func.func @winograd_output_transform
 // CHECK-NOT: memref.alloc
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
index f8b68f108dbef..73ffa190ec990 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/workgroup_specialization_pipeline_test.mlir
@@ -1,8 +1,15 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 hal.executable private @forward_dispatch_116 {
   hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb">) {
-    hal.executable.export public @forward_dispatch_116_matmul_128x30522x768 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
+    hal.executable.export public @forward_dispatch_116_matmul_128x30522x768 ordinal(0) layout(#pipeline_layout) {
     ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index):
       %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2
       hal.return %x, %y, %z : index, index, index
@@ -14,10 +21,10 @@ hal.executable private @forward_dispatch_116 {
         %c265458176 = arith.constant 265458176 : index
         %c0 = arith.constant 0 : index
         %cst = arith.constant 0.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c512) : !flow.dispatch.tensor<readonly:tensor<128x768xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c786944) : !flow.dispatch.tensor<readonly:tensor<768x30522xf32>>
-        %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c265458176) : !flow.dispatch.tensor<readonly:tensor<30522xf32>>
-        %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x30522xf32>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c512) : !flow.dispatch.tensor<readonly:tensor<128x768xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c786944) : !flow.dispatch.tensor<readonly:tensor<768x30522xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c265458176) : !flow.dispatch.tensor<readonly:tensor<30522xf32>>
+        %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<128x30522xf32>>
         %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 768], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x768xf32>> -> tensor<128x768xf32>
         %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [768, 30522], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<768x30522xf32>> -> tensor<768x30522xf32>
         %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [30522], strides = [1] : !flow.dispatch.tensor<readonly:tensor<30522xf32>> -> tensor<30522xf32>
@@ -53,8 +60,14 @@ hal.executable private @forward_dispatch_116 {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0) -> (d0)>
-#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
 hal.executable private @vectorized_dispatch_0 {
   hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
@@ -67,9 +80,9 @@ hal.executable private @vectorized_dispatch_0 {
       func.func @vectorized_dispatch_0_generic_102401() {
        %c0 = arith.constant 0 : index
        %cst = arith.constant -3.000000e+00 : f32
-        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<102401xf32>>
-        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<102401xf32>>
-        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<102401xf32>>
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<102401xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<102401xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<102401xf32>>
         %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [102401], strides = [1] : !flow.dispatch.tensor<readonly:tensor<102401xf32>> -> tensor<102401xf32>
         %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [102401], strides = [1] : !flow.dispatch.tensor<readonly:tensor<102401xf32>> -> tensor<102401xf32>
         %5 = tensor.empty() : tensor<102401xf32>
@@ -93,8 +106,8 @@ hal.executable private @vectorized_dispatch_0 {
 // CHECK: %[[BLKX2:.*]] = affine.min #{{.+}}()[%[[BLKX]]]
 // CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[BLKX2]], %[[c256]] : index
 // CHECK: scf.if %[[CMP]]
-// CHECK: %[[ARR:.*]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type<storage_buffer>>
-// CHECK: %[[ARR2:.*]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK: %[[ARR:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK: %[[ARR2:.*]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[c0]]) : memref<102401xf32, #hal.descriptor_type<storage_buffer>>
 // CHECK: %[[TIDX:.*]] = gpu.thread_id x
 // CHECK: %[[AFF:.*]] = affine.apply #{{.+}}(%[[TIDX]])[%[[BLKX]]]
 // CHECK: vector.transfer_read %[[ARR]][%[[AFF]]], %[[cst]] {in_bounds = [true]} : memref<102401xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEraseStorageBufferStaticShape.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEraseStorageBufferStaticShape.cpp
index d4f36fe6acd56..f9c91e9e14991 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEraseStorageBufferStaticShape.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEraseStorageBufferStaticShape.cpp
@@ -48,14 +48,16 @@ bool is1DStaticShapedStorageBuffer(
 /// e.g.,
 ///
 /// ```mlir
-/// hal.interface.binding.subspan set(0) binding(0) offset(%offset)
+/// hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
+///     offset(%offset)
 ///     : memref<16xf32>
 /// ```
 ///
 /// is re-written to
 ///
 /// ```mlir
-/// hal.interface.binding.subspan set(0) binding(0) offset(%offset)
+/// hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0)
+///     offset(%offset)
 ///     : memref<?xf32>{%c16}
 /// ```
 IREE::HAL::InterfaceBindingSubspanOp
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/annotate_winograd_loops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/annotate_winograd_loops.mlir
index d2c04e8653a3b..7796ffc2f8faa 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/annotate_winograd_loops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/annotate_winograd_loops.mlir
@@ -1,5 +1,11 @@
 // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-spirv-annotate-winograd-loops))" %s | FileCheck %s

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @_wino_input_dispatch_0() {
   %c0 = arith.constant 0 : index
   %c1280 = arith.constant 1280 : index
@@ -10,8 +16,8 @@ func.func @_wino_input_dispatch_0() {
   %c1 = arith.constant 1 : index
   %c32 = arith.constant 32 : index
   %0 = tensor.empty() : tensor<8x8xf32>
-  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
-  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %workgroup_id_x = hal.interface.workgroup.id[0] : index
   %workgroup_count_x = hal.interface.workgroup.count[0] : index
   %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -62,9 +68,9 @@ func.func @_wino_input_dispatch_0() {
 // CHECK: %[[C1:.+]] = arith.constant 1 : index
 // CHECK: %[[C32:.+]] = arith.constant 32 : index
 // CHECK: %[[D0:.+]] = tensor.empty() : tensor<8x8xf32>
-// CHECK: %[[D1:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[C0]])
+// CHECK: %[[D1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[C0]])
 // CHECK-SAME: : !flow.dispatch.tensor>
-// CHECK: %[[D2:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[C0]])
+// CHECK: %[[D2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[C0]])
 // CHECK-SAME: : !flow.dispatch.tensor>
 // CHECK: %[[WORKGROUP_ID_X:.+]] = hal.interface.workgroup.id[0] : index
 // CHECK: %[[WORKGROUP_COUNT_X:.+]] = hal.interface.workgroup.count[0] : index
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
index fe7f86bec5689..4e6ea89053601 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_conv.mlir
@@ -2,23 +2,28 @@

 // Conv - large OC - distribute to only one workgroup dimension.

-module {
-  func.func @conv_112x112x512() {
-    %c0 = arith.constant 0 : index
-    %c512 = arith.constant 512 : index
-    %c112 = arith.constant 112 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
-    %5 = tensor.empty() : tensor<1x112x112x512xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%6 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1] : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @conv_112x112x512() {
+  %c0 = arith.constant 0 : index
+  %c512 = arith.constant 512 : index
+  %c112 = arith.constant 112 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
+  %5 = tensor.empty() : tensor<1x112x112x512xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%6 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1] : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -32,23 +37,28 @@ module {

 // Conv - medium OC/OW/OH - distribute to two workgroup dimensions.

-module {
-  func.func @conv_112x112x32() {
-    %c0 = arith.constant 0 : index
-    %c32 = arith.constant 32 : index
-    %c112 = arith.constant 112 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
-    %5 = tensor.empty() : tensor<1x112x112x32xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @conv_112x112x32() {
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %c112 = arith.constant 112 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
+  %5 = tensor.empty() : tensor<1x112x112x32xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -62,22 +72,27 @@ module {

 // Conv - small OC/OW/OH - distribute to all three workgroup dimensions.

-module {
-  func.func @conv_16x16x16() {
-    %c0 = arith.constant 0 : index
-    %c16 = arith.constant 16 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
-    %5 = tensor.empty() : tensor<1x16x16x16xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1] : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @conv_16x16x16() {
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
+  %5 = tensor.empty() : tensor<1x16x16x16xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1] : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -90,23 +105,28 @@ module {

 // Depthwise conv - small OC/OW/OH - distribute to all three workgroup dimensions.

-module {
-  func.func @dwconv_28x28x144() {
-    %c0 = arith.constant 0 : index
-    %c144 = arith.constant 144 : index
-    %c28 = arith.constant 28 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
-    %5 = tensor.empty() : tensor<1x28x28x144xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%6 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1] : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @dwconv_28x28x144() {
+  %c0 = arith.constant 0 : index
+  %c144 = arith.constant 144 : index
+  %c28 = arith.constant 28 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 57, 57, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
+  %5 = tensor.empty() : tensor<1x28x28x144xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%6 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1] : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -120,24 +140,30 @@ module {

 // Depthwise conv - tiny OC/OW/OH - starving the GPU.

-module {
-  func.func @dwconv_4x4x8() {
-    %c0 = arith.constant 0 : index
-    %c8 = arith.constant 8 : index
-    %c4 = arith.constant 4 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>> -> tensor<1x9x9x8xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
-    %5 = tensor.empty() : tensor<1x4x4x8xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x9x9x8xf32>, tensor<3x3x8xf32>) outs(%6 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 4, 8], strides = [1, 1, 1, 1] : tensor<1x4x4x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @dwconv_4x4x8() {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c4 = arith.constant 4 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x9x9x8xf32>> -> tensor<1x9x9x8xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
+  %5 = tensor.empty() : tensor<1x4x4x8xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x9x9x8xf32>, tensor<3x3x8xf32>) outs(%6 : tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 4, 4, 8], strides = [1, 1, 1, 1] : tensor<1x4x4x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x4x4x8xf32>>
+  return
 }
+
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
 // CHECK: func.func @dwconv_4x4x8()
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
index ca83457b13c4f..60a9e65d931dc 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_adreno_matmul.mlir
@@ -2,24 +2,30 @@

 // Large matmul that can match the best tiling scheme.

-module {
-  func.func @matmul_1024x2048x512() {
-    %c0 = arith.constant 0 : index
-    %c2048 = arith.constant 2048 : index
-    %c1024 = arith.constant 1024 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
-    %5 = tensor.empty() : tensor<1024x2048xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
-    %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%6 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_1024x2048x512() {
+  %c0 = arith.constant 0 : index
+  %c2048 = arith.constant 2048 : index
+  %c1024 = arith.constant 1024 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
+  %5 = tensor.empty() : tensor<1024x2048xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+  %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%6 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+  return
 }
+
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
 // CHECK: func.func @matmul_1024x2048x512()
@@ -30,23 +36,29 @@ module {

 // -----

 // Small matmul N that can still tile to all threads in a workgroup.

-module {
-  func.func @matmul_3136x24x96() {
-    %c0 = arith.constant 0 : index
-    %c24 = arith.constant 24 : index
-    %c3136 = arith.constant 3136 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
-    %5 = tensor.empty() : tensor<3136x24xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
-    %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%6 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1] : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
-    return
-  }
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_3136x24x96() {
+  %c0 = arith.constant 0 : index
+  %c24 = arith.constant 24 : index
+  %c3136 = arith.constant 3136 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
+  %5 = tensor.empty() : tensor<3136x24xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+  %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%6 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1] : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -59,23 +71,29 @@ module {

 // -----

 // Small matmul M that can still tile to all threads in a workgroup.

-module {
-  func.func @matmul_196x64x192() {
-    %c0 = arith.constant 0 : index
-    %c64 = arith.constant 64 : index
-    %c196 = arith.constant 196 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
-    %5 = tensor.empty() : tensor<196x64xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x64xf32>) -> tensor<196x64xf32>
-    %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%6 : tensor<196x64xf32>) -> tensor<196x64xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1] : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
-    return
-  }
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_196x64x192() {
+  %c0 = arith.constant 0 : index
+  %c64 = arith.constant 64 : index
+  %c196 = arith.constant 196 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
+  %5 = tensor.empty() : tensor<196x64xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x64xf32>) -> tensor<196x64xf32>
+  %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%6 : tensor<196x64xf32>) -> tensor<196x64xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1] : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -89,19 +107,24 @@ module {

 // Small matmul K that can still tile to all threads in a workgroup.

-module {
-  func.func @matmul_12544x96x16() {
-    %c0 = arith.constant 0 : index
-    %c96 = arith.constant 96 : index
-    %c12544 = arith.constant 12544 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32>
-    linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
-    linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_12544x96x16() {
+  %c0 = arith.constant 0 : index
+  %c96 = arith.constant 96 : index
+  %c12544 = arith.constant 12544 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<12544x16xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x96xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<12544x96xf32>
+  linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>)
+  linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>)
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -115,23 +138,28 @@ module {

 // Odd matmul M and small N that cannot utilize all threads in a workgroup.

-module {
-  func.func @matmul_49x160x576() {
-    %c0 = arith.constant 0 : index
-    %c160 = arith.constant 160 : index
-    %c49 = arith.constant 49 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<49x576xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<576x160xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x576xf32>> -> tensor<49x576xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x160xf32>> -> tensor<576x160xf32>
-    %5 = tensor.empty() : tensor<49x160xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<49x160xf32>) -> tensor<49x160xf32>
-    %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%6 : tensor<49x160xf32>) -> tensor<49x160xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1] : tensor<49x160xf32> -> !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_49x160x576() {
+  %c0 = arith.constant 0 : index
+  %c160 = arith.constant 160 : index
+  %c49 = arith.constant 49 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<49x576xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<576x160xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<49x576xf32>> -> tensor<49x576xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<576x160xf32>> -> tensor<576x160xf32>
+  %5 = tensor.empty() : tensor<49x160xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<49x160xf32>) -> tensor<49x160xf32>
+  %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%6 : tensor<49x160xf32>) -> tensor<49x160xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1] : tensor<49x160xf32> -> !flow.dispatch.tensor<writeonly:tensor<49x160xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -145,23 +173,28 @@ module {

 // Large batch matmul.

-module {
-  func.func @batch_matmul_4x384x384() {
-    %c0 = arith.constant 0 : index
-    %c384 = arith.constant 384 : index
-    %c4 = arith.constant 4 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>> -> tensor<4x384x32xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>> -> tensor<4x32x384xf32>
-    %5 = tensor.empty() : tensor<4x384x384xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
-    %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%6 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1] : tensor<4x384x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @batch_matmul_4x384x384() {
+  %c0 = arith.constant 0 : index
+  %c384 = arith.constant 384 : index
+  %c4 = arith.constant 4 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x384x32xf32>> -> tensor<4x384x32xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x384xf32>> -> tensor<4x32x384xf32>
+  %5 = tensor.empty() : tensor<4x384x384xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
+  %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%6 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1] : tensor<4x384x384xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x384x384xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -175,23 +208,28 @@ module {

 // Small batch matmul.

-module {
-  func.func @batch_matmul_4x8x8() {
-    %c0 = arith.constant 0 : index
-    %c8 = arith.constant 8 : index
-    %c4 = arith.constant 4 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 8, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>> -> tensor<4x8x32xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>> -> tensor<4x32x8xf32>
-    %5 = tensor.empty() : tensor<4x8x8xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
-    %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<4x8x32xf32>, tensor<4x32x8xf32>) outs(%6 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 8, 8], strides = [1, 1, 1] : tensor<4x8x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @batch_matmul_4x8x8() {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c4 = arith.constant 4 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 8, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x8x32xf32>> -> tensor<4x8x32xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4x32x8xf32>> -> tensor<4x32x8xf32>
+  %5 = tensor.empty() : tensor<4x8x8xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
+  %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<4x8x32xf32>, tensor<4x32x8xf32>) outs(%6 : tensor<4x8x8xf32>) -> tensor<4x8x8xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 8, 8], strides = [1, 1, 1] : tensor<4x8x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x8x8xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir
index 25dcfa0575958..6f408be70f1b5 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_conv.mlir
@@ -1,28 +1,34 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=rdna2@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @nhwc_conv_pointwise_2x64x64x320() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
-    %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 66, 66, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>> -> tensor<2x66x66x320xf16>
-    %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 320, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>> -> tensor<3x3x320x320xf16>
-    %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>> -> tensor<2x64x64x320xf16>
-    %7 = tensor.empty() : tensor<2x64x64x320xf16>
-    %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
-    %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%4, %5 : tensor<2x66x66x320xf16>, tensor<3x3x320x320xf16>) outs(%8 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
-    %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<2x64x64x320xf16>, tensor<2x64x64x320xf16>) outs(%7 : tensor<2x64x64x320xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %11 = arith.divf %in, %in_0 : f16
-      linalg.yield %11 : f16
-    } -> tensor<2x64x64x320xf16>
-    flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : tensor<2x64x64x320xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
-    return
-  }
+func.func @nhwc_conv_pointwise_2x64x64x320() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
+  %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 66, 66, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x66x66x320xf16>> -> tensor<2x66x66x320xf16>
+  %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 320, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x320x320xf16>> -> tensor<3x3x320x320xf16>
+  %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x64x64x320xf16>> -> tensor<2x64x64x320xf16>
+  %7 = tensor.empty() : tensor<2x64x64x320xf16>
+  %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
+  %9 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%4, %5 : tensor<2x66x66x320xf16>, tensor<3x3x320x320xf16>) outs(%8 : tensor<2x64x64x320xf16>) -> tensor<2x64x64x320xf16>
+  %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<2x64x64x320xf16>, tensor<2x64x64x320xf16>) outs(%7 : tensor<2x64x64x320xf16>) {
+  ^bb0(%in: f16, %in_0: f16, %out: f16):
+    %11 = arith.divf %in, %in_0 : f16
+    linalg.yield %11 : f16
+  } -> tensor<2x64x64x320xf16>
+  flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0, 0], sizes = [2, 64, 64, 320], strides = [1, 1, 1, 1] : tensor<2x64x64x320xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x64x64x320xf16>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
index 77054228433e3..c0d891b554244 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
@@ -1,18 +1,23 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=rdna2@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s
-module {
-  func.func @batch_matmul_f32_16x4096x40x4096() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
-    %6 = linalg.batch_matmul ins(%3, %4 : tensor<16x4096x4096xf32>, tensor<16x4096x40xf32>) outs(%5 : tensor<16x4096x40xf32>) -> tensor<16x4096x40xf32>
-    flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : tensor<16x4096x40xf32> -> !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @batch_matmul_f32_16x4096x40x4096() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>> -> tensor<16x4096x40xf32>
+  %6 = linalg.batch_matmul ins(%3, %4 : tensor<16x4096x4096xf32>, tensor<16x4096x40xf32>) outs(%5 : tensor<16x4096x40xf32>) -> tensor<16x4096x40xf32>
+  flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0], sizes = [16, 4096, 40], strides = [1, 1, 1] : tensor<16x4096x40xf32> -> !flow.dispatch.tensor<readwrite:tensor<16x4096x40xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -25,21 +30,26 @@ module {

 // -----

-module {
-  func.func @matmul_f16_64x640x320() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x320xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<320x640xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320xf16>> -> tensor<64x320xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [320, 640], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<320x640xf16>> -> tensor<320x640xf16>
-    %5 = tensor.empty() : tensor<64x640xf16>
-    %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x640xf16>) -> tensor<64x640xf16>
-    %7 = linalg.matmul ins(%3, %4 : tensor<64x320xf16>, tensor<320x640xf16>) outs(%6 : tensor<64x640xf16>) -> tensor<64x640xf16>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 640], strides = [1, 1] : tensor<64x640xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_f16_64x640x320() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<64x320xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<320x640xf16>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 320], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x320xf16>> -> tensor<64x320xf16>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [320, 640], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<320x640xf16>> -> tensor<320x640xf16>
+  %5 = tensor.empty() : tensor<64x640xf16>
+  %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x640xf16>) -> tensor<64x640xf16>
+  %7 = linalg.matmul ins(%3, %4 : tensor<64x320xf16>, tensor<320x640xf16>) outs(%6 : tensor<64x640xf16>) -> tensor<64x640xf16>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 640], strides = [1, 1] : tensor<64x640xf16> -> !flow.dispatch.tensor<writeonly:tensor<64x640xf16>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -51,21 +61,25 @@ module {

 // -----

-module {
-  func.func @batch_matmul_f32_16x4096x40x4096() {
-    %cst = arith.constant 0.000000e+00 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>> -> tensor<16x4096x48xf32>
-    %5 = tensor.empty() : tensor<16x4096x48xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
-    %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x4096x4096xf32>, tensor<16x4096x48xf32>) outs(%6 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : tensor<16x4096x48xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @batch_matmul_f32_16x4096x40x4096() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x4096xf32>> -> tensor<16x4096x4096xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4096x48xf32>> -> tensor<16x4096x48xf32>
+  %5 = tensor.empty() : tensor<16x4096x48xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
+  %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x4096x4096xf32>, tensor<16x4096x48xf32>) outs(%6 : tensor<16x4096x48xf32>) -> tensor<16x4096x48xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 4096, 48], strides = [1, 1, 1] : tensor<16x4096x48xf32> -> !flow.dispatch.tensor<writeonly:tensor<16x4096x48xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -77,28 +91,33 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-module {
-  func.func @batch_matmul_f16_1x4096x4096x512() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x4096x512xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x512x4096xf16>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x4096x4096xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 4096, 512], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4096x512xf16>> -> tensor<1x4096x512xf16>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 512, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x512x4096xf16>> -> tensor<1x512x4096xf16>
-    %5 = tensor.empty() : tensor<1x4096x4096xf32>
-    %6 = tensor.empty() : tensor<1x4096x4096xf16>
-    %7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16>
-    %8 = linalg.batch_matmul ins(%3, %4 : 
tensor<1x4096x512xf16>, tensor<1x512x4096xf16>) outs(%7 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16> - %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : tensor<1x4096x4096xf16>) outs(%5 : tensor<1x4096x4096xf32>) { - ^bb0(%in: f16, %out: f32): - %10 = arith.extf %in : f16 to f32 - linalg.yield %10 : f32 - } -> tensor<1x4096x4096xf32> - flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0], sizes = [1, 4096, 4096], strides = [1, 1, 1] : tensor<1x4096x4096xf32> -> !flow.dispatch.tensor> - return - } +func.func @batch_matmul_f16_1x4096x4096x512() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 4096, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4096x512xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 512, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x512x4096xf16> + %5 = tensor.empty() : tensor<1x4096x4096xf32> + %6 = tensor.empty() : tensor<1x4096x4096xf16> + %7 = linalg.fill ins(%cst : f16) outs(%6 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16> + %8 = linalg.batch_matmul ins(%3, %4 : tensor<1x4096x512xf16>, tensor<1x512x4096xf16>) outs(%7 : tensor<1x4096x4096xf16>) -> tensor<1x4096x4096xf16> + %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%8 : tensor<1x4096x4096xf16>) outs(%5 : tensor<1x4096x4096xf32>) { + ^bb0(%in: f16, %out: f32): + %10 = arith.extf %in : f16 to f32 + linalg.yield %10 : f32 + } -> tensor<1x4096x4096xf32> + flow.dispatch.tensor.store %9, %2, offsets = [0, 0, 0], sizes = [1, 4096, 4096], strides = [1, 1, 1] : tensor<1x4096x4096xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -110,53 +129,59 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> #map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)> #map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -module { - func.func @matmul_multi_reduce_i4xf32xf32() { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 - %5 = arith.index_castui %0 : i32 to index - %6 = arith.index_castui %1 : i32 to index - %7 = arith.index_castui %2 : i32 to index - %8 = arith.index_castui %3 : i32 to index - %9 = arith.index_castui %4 : i32 to index - %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> - %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) 
flags(ReadOnly) : !flow.dispatch.tensor> - %12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor> - %13 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor> - %14 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor> - %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<11008x32x128xi4> - %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<11008x32xf32> - %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<11008x32xf32> - %18 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [512, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<512x32x128xf32> - %19 = tensor.empty() : tensor<512x11008xf32> - %20 = tensor.empty() : tensor<11008x32x128xf32> - %21 = linalg.fill ins(%cst : f32) outs(%19 : tensor<512x11008xf32>) -> tensor<512x11008xf32> - %22 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %16, %17 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%20 : tensor<11008x32x128xf32>) { - ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): - %24 = arith.extui %in : i4 to i32 - %25 = arith.uitofp %24 : i32 to f32 - %26 = arith.subf %25, %in_1 : f32 - %27 = arith.mulf %26, %in_0 : f32 - linalg.yield %27 : f32 - } -> tensor<11008x32x128xf32> - %23 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%18, %22 : tensor<512x32x128xf32>, tensor<11008x32x128xf32>) outs(%21 : tensor<512x11008xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %24 = arith.mulf %in, %in_0 : f32 - %25 = arith.addf %24, %out : f32 - linalg.yield %25 : f32 - } -> tensor<512x11008xf32> - flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [512, 11008], strides = [1, 1] : tensor<512x11008xf32> -> !flow.dispatch.tensor> - return - } +func.func @matmul_multi_reduce_i4xf32xf32() { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 + %5 = arith.index_castui %0 : i32 to index + %6 = arith.index_castui %1 : i32 to index + %7 = arith.index_castui %2 : i32 to index + %8 = arith.index_castui %3 : i32 to index + %9 = arith.index_castui %4 : i32 to index + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor> + %12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor> + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%8) 
flags(ReadOnly) : !flow.dispatch.tensor> + %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%9) : !flow.dispatch.tensor> + %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<11008x32x128xi4> + %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<11008x32xf32> + %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<11008x32xf32> + %18 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [512, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<512x32x128xf32> + %19 = tensor.empty() : tensor<512x11008xf32> + %20 = tensor.empty() : tensor<11008x32x128xf32> + %21 = linalg.fill ins(%cst : f32) outs(%19 : tensor<512x11008xf32>) -> tensor<512x11008xf32> + %22 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %16, %17 : tensor<11008x32x128xi4>, tensor<11008x32xf32>, tensor<11008x32xf32>) outs(%20 : tensor<11008x32x128xf32>) { + ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): + %24 = arith.extui %in : i4 to i32 + %25 = arith.uitofp %24 : i32 to f32 + %26 = arith.subf %25, %in_1 : f32 + %27 = arith.mulf %26, %in_0 : f32 + linalg.yield %27 : f32 + } -> tensor<11008x32x128xf32> + %23 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%18, %22 : tensor<512x32x128xf32>, tensor<11008x32x128xf32>) outs(%21 : tensor<512x11008xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %24 = arith.mulf %in, %in_0 : f32 + %25 = arith.addf %24, %out : f32 + linalg.yield %25 : f32 + } -> tensor<512x11008xf32> + flow.dispatch.tensor.store %23, %14, offsets = [0, 0], sizes = [512, 11008], strides = [1, 1] : tensor<512x11008xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir index 6e2d836595292..c0ac97ccca82e 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir @@ -1,34 +1,41 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=rdna3@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @matmul_256x1024x128_div_add() { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c256 = arith.constant 256 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) 
type(storage_buffer) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> - %7 = tensor.empty() : tensor<256x1024xf16> - %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x128xf16> - %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x1024xf16> - %10 = tensor.empty() : tensor<256x1024xf16> - %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.divf %in, %in_0 : f16 - %15 = arith.addf %14, %in_1 : f16 - linalg.yield %15 : f16 - } -> tensor<256x1024xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> - return - } +func.func @matmul_256x1024x128_div_add() { + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> + %7 = tensor.empty() : tensor<256x1024xf16> + %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x128xf16> + %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x1024xf16> + %10 = tensor.empty() : tensor<256x1024xf16> + %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) { + ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.divf %in, %in_0 : f16 + %15 = arith.addf %14, %in_1 : f16 + 
linalg.yield %15 : f16 + } -> tensor<256x1024xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -40,29 +47,35 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @batch_matmul_16x128x256x512_div() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x512xf16> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x512x256xf16> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x256xf16> - %7 = tensor.empty() : tensor<16x128x256xf16> - %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> - %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> - %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : tensor<16x128x256xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %11 = arith.divf %in, %in_0 : f16 - linalg.yield %11 : f16 - } -> tensor<16x128x256xf16> - flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor> - return - } +func.func @batch_matmul_16x128x256x512_div() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x512xf16> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x512x256xf16> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<16x128x256xf16> + %7 = tensor.empty() : tensor<16x128x256xf16> + %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> + %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> + %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : tensor<16x128x256xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %11 = arith.divf %in, %in_0 : f16 + linalg.yield %11 : f16 + } -> tensor<16x128x256xf16> + flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -76,32 +89,37 @@ module { // Linalg.generic that is a batch matmul. +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> #map4 = affine_map<(d0, d1, d2) -> (d2, d1)> #map5 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @generic_batch_matmul_32x8x512x64() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x64xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<32x64x512xf16> - %5 = tensor.empty() : tensor<32x128x512xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<32x128x512xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : tensor<32x128x512xf16> -> !flow.dispatch.tensor> - return - } +func.func @generic_batch_matmul_32x8x512x64() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], 
sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x64xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<32x64x512xf16> + %5 = tensor.empty() : tensor<32x128x512xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<32x128x512xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : tensor<32x128x512xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -115,21 +133,26 @@ module { // K dim size not divisible by 32. -module { - func.func @batch_matmul_16x1024x1024x80() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x1024x80xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x80x1024xf16> - %5 = tensor.empty() : tensor<16x1024x1024xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> - %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @batch_matmul_16x1024x1024x80() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x1024x80xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x80x1024xf16> + %5 = tensor.empty() : tensor<16x1024x1024xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> + %7 = 
linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -143,23 +166,28 @@ module { // Small K - not supported by cooperative matrix. -module { - func.func @matmul_256x1024x8() { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c256 = arith.constant 256 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x8xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x1024xf16> - %5 = tensor.empty() : tensor<256x1024xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%6 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_256x1024x8() { + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x8xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x1024xf16> + %5 = tensor.empty() : tensor<256x1024xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%6 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matvec.mlir index 7b0480a169c6f..fcf53412841f0 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matvec.mlir @@ -1,41 +1,48 @@ // RUN: iree-opt --split-input-file 
--iree-gpu-test-target=cdna2@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0)> -module { - func.func @i4_dequant_matvec_f32() { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x128xf32> - %9 = tensor.empty() : tensor<4096xf32> - %10 = tensor.empty() : tensor<4096x86x128xf32> - %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096x86x128xf32>) { - ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f32 - %16 = arith.subf %15, %in_1 : f32 - %17 = arith.mulf %16, %in_0 : f32 - linalg.yield %17 : f32 - } -> tensor<4096x86x128xf32> - %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%11 : tensor<4096xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %14 = arith.mulf %in, %in_0 : f32 - %15 = arith.addf %14, %out : f32 - linalg.yield %15 : f32 - } -> tensor<4096xf32> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor> - return - } +func.func @i4_dequant_matvec_f32() { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, 
offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x128xf32> + %9 = tensor.empty() : tensor<4096xf32> + %10 = tensor.empty() : tensor<4096x86x128xf32> + %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32> + %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096x86x128xf32>) { + ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f32 + %16 = arith.subf %15, %in_1 : f32 + %17 = arith.mulf %16, %in_0 : f32 + linalg.yield %17 : f32 + } -> tensor<4096x86x128xf32> + %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%11 : tensor<4096xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %14 = arith.mulf %in, %in_0 : f32 + %15 = arith.addf %14, %out : f32 + linalg.yield %15 : f32 + } -> tensor<4096xf32> + flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -47,45 +54,50 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, 0)> #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)> #map3 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)> #map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> -module { - func.func @i4_dequant_matvec_f32() { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f32 - %c4294967296_i64 = arith.constant 4294967296 : i64 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x1xf32> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x1xf32> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 1, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x32x128xf32> - %9 = 
tensor.empty() : tensor<1x1x4096xf32> - %10 = tensor.empty() : tensor<4096x32x128xf32> - %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x1x4096xf32>) -> tensor<1x1x4096xf32> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32x1xf32>, tensor<4096x32x1xf32>) outs(%10 : tensor<4096x32x128xf32>) { - ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f32 - %16 = arith.subf %15, %in_1 : f32 - %17 = arith.mulf %16, %in_0 : f32 - linalg.yield %17 : f32 - } -> tensor<4096x32x128xf32> - %13 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<1x1x32x128xf32>, tensor<4096x32x128xf32>) outs(%11 : tensor<1x1x4096xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %14 = arith.mulf %in, %in_0 : f32 - %15 = arith.addf %14, %out : f32 - linalg.yield %15 : f32 - } -> tensor<1x1x4096xf32> - flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf32> -> !flow.dispatch.tensor> - return - } +func.func @i4_dequant_matvec_f32() { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f32 + %c4294967296_i64 = arith.constant 4294967296 : i64 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x1xf32> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x32x1xf32> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 1, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x32x128xf32> + %9 = tensor.empty() : tensor<1x1x4096xf32> + %10 = tensor.empty() : tensor<4096x32x128xf32> + %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x1x4096xf32>) -> tensor<1x1x4096xf32> + %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x32x128xi4>, tensor<4096x32x1xf32>, tensor<4096x32x1xf32>) outs(%10 : tensor<4096x32x128xf32>) { + ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f32 + %16 = arith.subf %15, %in_1 : f32 + %17 = arith.mulf %16, %in_0 : f32 + linalg.yield %17 : f32 + } -> tensor<4096x32x128xf32> + %13 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<1x1x32x128xf32>, tensor<4096x32x128xf32>) outs(%11 : tensor<1x1x4096xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %14 = arith.mulf 
%in, %in_0 : f32 + %15 = arith.addf %14, %out : f32 + linalg.yield %15 : f32 + } -> tensor<1x1x4096xf32> + flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -97,72 +109,78 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> #map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)> #map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -module { - func.func @i4_dequant_matvec_f32() { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 - %5 = hal.interface.constant.load[5] : i32 - %6 = hal.interface.constant.load[6] : i32 - %7 = hal.interface.constant.load[7] : i32 - %8 = hal.interface.constant.load[8] : i32 - %9 = arith.index_castui %0 : i32 to index - %10 = arith.index_castui %1 : i32 to index - %11 = arith.index_castui %2 : i32 to index - %12 = arith.extui %3 : i32 to i64 - %13 = arith.extui %4 : i32 to i64 - %14 = arith.shli %13, %c32_i64 : i64 - %15 = arith.ori %12, %14 : i64 - %16 = arith.index_castui %15 : i64 to index - %17 = arith.extui %5 : i32 to i64 - %18 = arith.extui %6 : i32 to i64 - %19 = arith.shli %18, %c32_i64 : i64 - %20 = arith.ori %17, %19 : i64 - %21 = arith.index_castui %20 : i64 to index - %22 = arith.extui %7 : i32 to i64 - %23 = arith.extui %8 : i32 to i64 - %24 = arith.shli %23, %c32_i64 : i64 - %25 = arith.ori %22, %24 : i64 - %26 = arith.index_castui %25 : i64 to index - %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor> - %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor> - %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor> - %30 = flow.dispatch.workload.ordinal %26, 0 : index - %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor>{%30} - %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor>{%30} - %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> - %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> - %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> - %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor>{%30} -> tensor - %37 = tensor.empty(%30) : tensor - %38 = tensor.empty() : tensor<4096x86x128xf32> - %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor) -> tensor - %40 = 
linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) { - ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): - %42 = arith.extui %in : i4 to i32 - %43 = arith.uitofp %42 : i32 to f32 - %44 = arith.subf %43, %in_1 : f32 - %45 = arith.mulf %44, %in_0 : f32 - linalg.yield %45 : f32 - } -> tensor<4096x86x128xf32> - %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%36, %40 : tensor, tensor<4096x86x128xf32>) outs(%39 : tensor) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %42 = arith.mulf %in, %in_0 : f32 - %43 = arith.addf %42, %out : f32 - linalg.yield %43 : f32 - } -> tensor - flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%30} - return - } +func.func @i4_dequant_matvec_f32() { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 + %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32 + %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32 + %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32 + %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32 + %9 = arith.index_castui %0 : i32 to index + %10 = arith.index_castui %1 : i32 to index + %11 = arith.index_castui %2 : i32 to index + %12 = arith.extui %3 : i32 to i64 + %13 = arith.extui %4 : i32 to i64 + %14 = arith.shli %13, %c32_i64 : i64 + %15 = arith.ori %12, %14 : i64 + %16 = arith.index_castui %15 : i64 to index + %17 = arith.extui %5 : i32 to i64 + %18 = arith.extui %6 : i32 to i64 + %19 = arith.shli %18, %c32_i64 : i64 + %20 = arith.ori %17, %19 : i64 + %21 = arith.index_castui %20 : i64 to index + %22 = arith.extui %7 : i32 to i64 + %23 = arith.extui %8 : i32 to i64 + %24 = arith.shli %23, %c32_i64 : i64 + %25 = arith.ori %22, %24 : i64 + %26 = arith.index_castui %25 : i64 to index + %27 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor> + %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor> + %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor> + %30 = flow.dispatch.workload.ordinal %26, 0 : index + %31 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor>{%30} + %32 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%21) : !flow.dispatch.tensor>{%30} + %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> + %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : 
!flow.dispatch.tensor> -> tensor<4096x86xf32> + %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> + %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor>{%30} -> tensor + %37 = tensor.empty(%30) : tensor + %38 = tensor.empty() : tensor<4096x86x128xf32> + %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor) -> tensor + %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) { + ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): + %42 = arith.extui %in : i4 to i32 + %43 = arith.uitofp %42 : i32 to f32 + %44 = arith.subf %43, %in_1 : f32 + %45 = arith.mulf %44, %in_0 : f32 + linalg.yield %45 : f32 + } -> tensor<4096x86x128xf32> + %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%36, %40 : tensor, tensor<4096x86x128xf32>) outs(%39 : tensor) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %42 = arith.mulf %in, %in_0 : f32 + %43 = arith.addf %42, %out : f32 + linalg.yield %43 : f32 + } -> tensor + flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%30} + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -174,44 +192,51 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, 0)> #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)> #map3 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)> #map4 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> -module { - func.func @i4_dequant_matvec_f16() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x1xf16> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x1xf16> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 1, 86, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x86x128xf16> - %9 = 
tensor.empty() : tensor<1x1x4096xf16> - %10 = tensor.empty() : tensor<4096x86x128xf16> - %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x1x4096xf16>) -> tensor<1x1x4096xf16> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86x1xf16>, tensor<4096x86x1xf16>) outs(%10 : tensor<4096x86x128xf16>) { - ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f16 - %16 = arith.subf %15, %in_1 : f16 - %17 = arith.mulf %16, %in_0 : f16 - linalg.yield %17 : f16 - } -> tensor<4096x86x128xf16> - %13 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<1x1x86x128xf16>, tensor<4096x86x128xf16>) outs(%11 : tensor<1x1x4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %14 = arith.mulf %in, %in_0 : f16 - %15 = arith.addf %14, %out : f16 - linalg.yield %15 : f16 - } -> tensor<1x1x4096xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf16> -> !flow.dispatch.tensor> - return - } +func.func @i4_dequant_matvec_f16() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x1xf16> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x1xf16> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [1, 1, 86, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x86x128xf16> + %9 = tensor.empty() : tensor<1x1x4096xf16> + %10 = tensor.empty() : tensor<4096x86x128xf16> + %11 = linalg.fill ins(%cst : f16) outs(%9 : tensor<1x1x4096xf16>) -> tensor<1x1x4096xf16> + %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86x1xf16>, tensor<4096x86x1xf16>) outs(%10 : tensor<4096x86x128xf16>) { + ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f16 + %16 = arith.subf %15, %in_1 : f16 + %17 = arith.mulf %16, %in_0 : f16 + linalg.yield %17 : f16 + } -> tensor<4096x86x128xf16> + %13 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%8, %12 : 
tensor<1x1x86x128xf16>, tensor<4096x86x128xf16>) outs(%11 : tensor<1x1x4096xf16>) {
+  ^bb0(%in: f16, %in_0: f16, %out: f16):
+    %14 = arith.mulf %in, %in_0 : f16
+    %15 = arith.addf %14, %out : f16
+    linalg.yield %15 : f16
+  } -> tensor<1x1x4096xf16>
+  flow.dispatch.tensor.store %13, %4, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf16> -> !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf16>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -223,72 +248,79 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 9, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
 #map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-module {
-  func.func @i4_dequant_matvec() {
-    %c32_i64 = arith.constant 32 : i64
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = hal.interface.constant.load[2] : i32
-    %3 = hal.interface.constant.load[3] : i32
-    %4 = hal.interface.constant.load[4] : i32
-    %5 = hal.interface.constant.load[5] : i32
-    %6 = hal.interface.constant.load[6] : i32
-    %7 = hal.interface.constant.load[7] : i32
-    %8 = hal.interface.constant.load[8] : i32
-    %9 = arith.index_castui %0 : i32 to index
-    %10 = arith.index_castui %1 : i32 to index
-    %11 = arith.index_castui %2 : i32 to index
-    %12 = arith.extui %3 : i32 to i64
-    %13 = arith.extui %4 : i32 to i64
-    %14 = arith.shli %13, %c32_i64 : i64
-    %15 = arith.ori %12, %14 : i64
-    %16 = arith.index_castui %15 : i64 to index
-    %17 = arith.extui %5 : i32 to i64
-    %18 = arith.extui %6 : i32 to i64
-    %19 = arith.shli %18, %c32_i64 : i64
-    %20 = arith.ori %17, %19 : i64
-    %21 = arith.index_castui %20 : i64 to index
-    %22 = arith.extui %7 : i32 to i64
-    %23 = arith.extui %8 : i32 to i64
-    %24 = arith.shli %23, %c32_i64 : i64
-    %25 = arith.ori %22, %24 : i64
-    %26 = arith.index_castui %25 : i64 to index
-    %27 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
-    %28 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %29 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %30 = flow.dispatch.workload.ordinal %26, 0 : index
-    %31 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30}
-    %32 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
-    %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
-    %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30} -> tensor<?x86x128xf32>
-    %37 = tensor.empty(%30) : tensor<?x4096xf32>
-    %38 = tensor.empty() : tensor<4096x86x128xf32>
-    %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
-    %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) {
-    ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
-      %42 = arith.extui %in : i4 to i32
-      %43 = arith.uitofp %42 : i32 to f32
-      %44 = arith.subf %43, %in_1 : f32
-      %45 = arith.mulf %44, %in_0 : f32
-      linalg.yield %45 : f32
-    } -> tensor<4096x86x128xf32>
-    %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%36, %40 : tensor<?x86x128xf32>, tensor<4096x86x128xf32>) outs(%39 : tensor<?x4096xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %42 = arith.mulf %in, %in_0 : f32
-      %43 = arith.addf %42, %out : f32
-      linalg.yield %43 : f32
-    } -> tensor<?x4096xf32>
-    flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
-    return
-  }
+func.func @i4_dequant_matvec() {
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
+  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
+  %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
+  %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
+  %9 = arith.index_castui %0 : i32 to index
+  %10 = arith.index_castui %1 : i32 to index
+  %11 = arith.index_castui %2 : i32 to index
+  %12 = arith.extui %3 : i32 to i64
+  %13 = arith.extui %4 : i32 to i64
+  %14 = arith.shli %13, %c32_i64 : i64
+  %15 = arith.ori %12, %14 : i64
+  %16 = arith.index_castui %15 : i64 to index
+  %17 = arith.extui %5 : i32 to i64
+  %18 = arith.extui %6 : i32 to i64
+  %19 = arith.shli %18, %c32_i64 : i64
+  %20 = arith.ori %17, %19 : i64
+  %21 = arith.index_castui %20 : i64 to index
+  %22 = arith.extui %7 : i32 to i64
+  %23 = arith.extui %8 : i32 to i64
+  %24 = arith.shli %23, %c32_i64 : i64
+  %25 = arith.ori %22, %24 : i64
+  %26 = arith.index_castui %25 : i64 to index
+  %27 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+  %28 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%10) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+  %29 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%11) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+  %30 = flow.dispatch.workload.ordinal %26, 0 : index
+  %31 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%16) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30}
+  %32 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%21) : !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+  %33 = flow.dispatch.tensor.load %27, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+  %34 = flow.dispatch.tensor.load %28, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+  %35 = flow.dispatch.tensor.load %29, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+  %36 = flow.dispatch.tensor.load %31, offsets = [0, 0, 0], sizes = [%30, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x86x128xf32>>{%30} -> tensor<?x86x128xf32>
+  %37 = tensor.empty(%30) : tensor<?x4096xf32>
+  %38 = tensor.empty() : tensor<4096x86x128xf32>
+  %39 = linalg.fill ins(%cst : f32) outs(%37 : tensor<?x4096xf32>) -> tensor<?x4096xf32>
+  %40 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%33, %34, %35 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%38 : tensor<4096x86x128xf32>) {
+  ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+    %42 = arith.extui %in : i4 to i32
+    %43 = arith.uitofp %42 : i32 to f32
+    %44 = arith.subf %43, %in_1 : f32
+    %45 = arith.mulf %44, %in_0 : f32
+    linalg.yield %45 : f32
+  } -> tensor<4096x86x128xf32>
+  %41 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%36, %40 : tensor<?x86x128xf32>, tensor<4096x86x128xf32>) outs(%39 : tensor<?x4096xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %42 = arith.mulf %in, %in_0 : f32
+    %43 = arith.addf %42, %out : f32
+    linalg.yield %43 : f32
+  } -> tensor<?x4096xf32>
+  flow.dispatch.tensor.store %41, %32, offsets = [0, 0], sizes = [%30, 4096], strides = [1, 1] : tensor<?x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x4096xf32>>{%30}
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -300,66 +332,71 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 7, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
 #map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>
 #map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
-module {
-  func.func @i4_dequant_matvec() {
-    %c32_i64 = arith.constant 32 : i64
-    %cst = arith.constant 0.000000e+00 : f16
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = hal.interface.constant.load[2] : i32
-    %3 = hal.interface.constant.load[3] : i32
-    %4 = hal.interface.constant.load[4] : i32
-    %5 = hal.interface.constant.load[5] : i32
-    %6 = hal.interface.constant.load[6] : i32
-    %7 = arith.index_castui %0 : i32 to index
-    %8 = arith.index_castui %1 : i32 to index
-    %9 = arith.index_castui %2 : i32 to index
-    %10 = arith.extui %3 : i32 to i64
-    %11 = arith.extui %4 : i32 to i64
-    %12 = arith.shli %11, %c32_i64 : i64
-    %13 = arith.ori %10, %12 : i64
-    %14 = arith.index_castui %13 : i64 to index
-    %15 = arith.extui %5 : i32 to i64
-    %16 = arith.extui %6 : i32 to i64
-    %17 = arith.shli %16, %c32_i64 : i64
-    %18 = arith.ori %15, %17 : i64
-    %19 = arith.index_castui %18 : i64 to index
-    %20 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>>
-    %21 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
-    %22 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%9) flags(ReadOnly) :
!flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
-    %23 = flow.dispatch.workload.ordinal %19, 0 : index
-    %24 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23}
-    %25 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%14) : !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
-    %26 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>> -> tensor<11008x32x128xi4>
-    %27 = flow.dispatch.tensor.load %21, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
-    %28 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
-    %29 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0], sizes = [%23, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23} -> tensor<?x32x128xf16>
-    %30 = tensor.empty() : tensor<11008x32x128xf16>
-    %31 = tensor.empty(%23) : tensor<?x11008xf16>
-    %32 = linalg.fill ins(%cst : f16) outs(%31 : tensor<?x11008xf16>) -> tensor<?x11008xf16>
-    %33 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26, %27, %28 : tensor<11008x32x128xi4>, tensor<11008x32xf16>, tensor<11008x32xf16>) outs(%30 : tensor<11008x32x128xf16>) {
-    ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
-      %35 = arith.extui %in : i4 to i32
-      %36 = arith.uitofp %35 : i32 to f16
-      %37 = arith.subf %36, %in_1 : f16
-      %38 = arith.mulf %37, %in_0 : f16
-      linalg.yield %38 : f16
-    } -> tensor<11008x32x128xf16>
-    %34 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29, %33 : tensor<?x32x128xf16>, tensor<11008x32x128xf16>) outs(%32 : tensor<?x11008xf16>) {
-    ^bb0(%in: f16, %in_0: f16, %out: f16):
-      %35 = arith.mulf %in, %in_0 : f16
-      %36 = arith.addf %35, %out : f16
-      linalg.yield %36 : f16
-    } -> tensor<?x11008xf16>
-    flow.dispatch.tensor.store %34, %25, offsets = [0, 0], sizes = [%23, 11008], strides = [1, 1] : tensor<?x11008xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
-    return
-  }
+func.func @i4_dequant_matvec() {
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0.000000e+00 : f16
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
+  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
+  %7 = arith.index_castui %0 : i32 to index
+  %8 = arith.index_castui %1 : i32 to index
+  %9 = arith.index_castui %2 : i32 to index
+  %10 = arith.extui %3 : i32 to i64
+  %11 = arith.extui %4 : i32 to i64
+  %12 = arith.shli %11, %c32_i64 : i64
+  %13 = arith.ori %10, %12 : i64
+  %14 = arith.index_castui %13 : i64 to index
+  %15 = arith.extui %5 : i32 to i64
+  %16 = arith.extui %6 : i32 to i64
+  %17 = arith.shli %16, %c32_i64 : i64
+  %18 = arith.ori %15, %17 : i64
+  %19 = arith.index_castui %18 : i64 to index
+  %20 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>>
+  %21 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
+  %22 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%9) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>>
+  %23 = flow.dispatch.workload.ordinal %19, 0 : index
+  %24 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23}
+  %25 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%14) : !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
+  %26 = flow.dispatch.tensor.load %20, offsets = [0, 0, 0], sizes = [11008, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32x128xi4>> -> tensor<11008x32x128xi4>
+  %27 = flow.dispatch.tensor.load %21, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
+  %28 = flow.dispatch.tensor.load %22, offsets = [0, 0], sizes = [11008, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<11008x32xf16>> -> tensor<11008x32xf16>
+  %29 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0], sizes = [%23, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x32x128xf16>>{%23} -> tensor<?x32x128xf16>
+  %30 = tensor.empty() : tensor<11008x32x128xf16>
+  %31 = tensor.empty(%23) : tensor<?x11008xf16>
+  %32 = linalg.fill ins(%cst : f16) outs(%31 : tensor<?x11008xf16>) -> tensor<?x11008xf16>
+  %33 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26, %27, %28 : tensor<11008x32x128xi4>, tensor<11008x32xf16>, tensor<11008x32xf16>) outs(%30 : tensor<11008x32x128xf16>) {
+  ^bb0(%in: i4, %in_0: f16, %in_1: f16, %out: f16):
+    %35 = arith.extui %in : i4 to i32
+    %36 = arith.uitofp %35 : i32 to f16
+    %37 = arith.subf %36, %in_1 : f16
+    %38 = arith.mulf %37, %in_0 : f16
+    linalg.yield %38 : f16
+  } -> tensor<11008x32x128xf16>
+  %34 = linalg.generic {indexing_maps = [#map2, #map3, #map4], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%29, %33 : tensor<?x32x128xf16>, tensor<11008x32x128xf16>) outs(%32 : tensor<?x11008xf16>) {
+  ^bb0(%in: f16, %in_0: f16, %out: f16):
+    %35 = arith.mulf %in, %in_0 : f16
+    %36 = arith.addf %35, %out : f16
+    linalg.yield %36 : f16
+  } -> tensor<?x11008xf16>
+  flow.dispatch.tensor.store %34, %25, offsets = [0, 0], sizes = [%23, 11008], strides = [1, 1] : tensor<?x11008xf16> -> !flow.dispatch.tensor<writeonly:tensor<?x11008xf16>>{%23}
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -371,33 +408,38 @@ module {

 // -----

-module {
-  func.func @dynamic_batch_matvec() {
-    %c32_i64 = arith.constant 32 : i64
-    %cst = arith.constant 0.000000e+00 : f16
-    %0 = hal.interface.constant.load[0] : i32
-    %1 = hal.interface.constant.load[1] : i32
-    %2 = hal.interface.constant.load[2] : i32
-    %3 = hal.interface.constant.load[3] : i32
-    %4 = hal.interface.constant.load[4] : i32
-    %5 = arith.index_castui %0 : i32 to index
-    %6 = arith.index_castui %1 : i32 to index
-    %7 = arith.index_castui %2 : i32 to index
-    %8 = arith.index_castui %3 : i32 to index
-    %9 = arith.index_castui %4 : i32 to index
-    %10 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
-    %11 = flow.dispatch.workload.ordinal %8, 0 : index
-    %12 = flow.dispatch.workload.ordinal %9, 1 : index
-    %13 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11}
-    %14 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12}
-    %15 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes
= [32, 1, %11], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11} -> tensor<32x1x?xf16>
-    %16 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0], sizes = [32, %12, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12} -> tensor<32x?x128xf16>
-    %17 = tensor.empty() : tensor<32x1x128xf16>
-    %18 = linalg.fill ins(%cst : f16) outs(%17 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
-    %19 = linalg.batch_matmul ins(%15, %16 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%18 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
-    flow.dispatch.tensor.store %19, %10, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 5, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @dynamic_batch_matvec() {
+  %c32_i64 = arith.constant 32 : i64
+  %cst = arith.constant 0.000000e+00 : f16
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
+  %5 = arith.index_castui %0 : i32 to index
+  %6 = arith.index_castui %1 : i32 to index
+  %7 = arith.index_castui %2 : i32 to index
+  %8 = arith.index_castui %3 : i32 to index
+  %9 = arith.index_castui %4 : i32 to index
+  %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%7) : !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
+  %11 = flow.dispatch.workload.ordinal %8, 0 : index
+  %12 = flow.dispatch.workload.ordinal %9, 1 : index
+  %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11}
+  %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12}
+  %15 = flow.dispatch.tensor.load %13, offsets = [0, 0, 0], sizes = [32, 1, %11], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x1x?xf16>>{%11} -> tensor<32x1x?xf16>
+  %16 = flow.dispatch.tensor.load %14, offsets = [0, 0, 0], sizes = [32, %12, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x?x128xf16>>{%12} -> tensor<32x?x128xf16>
+  %17 = tensor.empty() : tensor<32x1x128xf16>
+  %18 = linalg.fill ins(%cst : f16) outs(%17 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
+  %19 = linalg.batch_matmul ins(%15, %16 : tensor<32x1x?xf16>, tensor<32x?x128xf16>) outs(%18 : tensor<32x1x128xf16>) -> tensor<32x1x128xf16>
+  flow.dispatch.tensor.store %19, %10, offsets = [0, 0, 0], sizes = [32, 1, 128], strides = [1, 1, 1] : tensor<32x1x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<32x1x128xf16>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
index fc6da02488be0..88f9f723c0873 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_conv.mlir
@@ -1,32 +1,39 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=vp_android_baseline_2022@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s

-// Convolution with consumer pointwise ops
+// Convolution with consumer pointwise ops.
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @nhwc_conv_pointwise_112x112x32() {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %c112 = arith.constant 112 : index
-    %c32 = arith.constant 32 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
-    %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
-    %5 = tensor.empty() : tensor<1x112x112x32xf32>
-    %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
-    %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
-    %8 = tensor.empty() : tensor<1x112x112x32xf32>
-    %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %10 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%6, %7 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%9 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %4 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) outs(%5 : tensor<1x112x112x32xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %12 = arith.subf %in, %in_0 : f32
-      linalg.yield %12 : f32
-    } -> tensor<1x112x112x32xf32>
-    flow.dispatch.tensor.store %11, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
-    return
-  }
+func.func @nhwc_conv_pointwise_112x112x32() {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c112 = arith.constant 112 : index
+  %c32 = arith.constant 32 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+  %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x112x112x32xf32>> -> tensor<1x112x112x32xf32>
+  %5 = tensor.empty() : tensor<1x112x112x32xf32>
+  %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+  %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
+  %8 = tensor.empty() : tensor<1x112x112x32xf32>
+  %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %10 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%6, %7 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%9 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %11 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10, %4 : tensor<1x112x112x32xf32>, tensor<1x112x112x32xf32>) outs(%5 : tensor<1x112x112x32xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %12 = arith.subf %in, %in_0 : f32
+    linalg.yield %12 : f32
+  } -> tensor<1x112x112x32xf32>
+  flow.dispatch.tensor.store %11, %3, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -38,19 +45,24 @@ module {

 // -----

-module {
-  func.func @nchw_conv_2x1280x8x8() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 1280, 10, 10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>> -> tensor<2x1280x10x10xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1280, 1280, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>> -> tensor<1280x1280x3x3xf32>
-    %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>> -> tensor<2x1280x8x8xf32>
-    %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 : tensor<2x1280x10x10xf32>, tensor<1280x1280x3x3xf32>) outs(%5 : tensor<2x1280x8x8xf32>) -> tensor<2x1280x8x8xf32>
-    flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : tensor<2x1280x8x8xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @nchw_conv_2x1280x8x8() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 1280, 10, 10], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1280x10x10xf32>> -> tensor<2x1280x10x10xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [1280, 1280, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280x3x3xf32>> -> tensor<1280x1280x3x3xf32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>> -> tensor<2x1280x8x8xf32>
+  %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %4 :
tensor<2x1280x10x10xf32>, tensor<1280x1280x3x3xf32>) outs(%5 : tensor<2x1280x8x8xf32>) -> tensor<2x1280x8x8xf32>
+  flow.dispatch.tensor.store %6, %2, offsets = [0, 0, 0, 0], sizes = [2, 1280, 8, 8], strides = [1, 1, 1, 1] : tensor<2x1280x8x8xf32> -> !flow.dispatch.tensor<readwrite:tensor<2x1280x8x8xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
index 39a3ec1131fa4..29d8e0422f317 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ext_ops.mlir
@@ -1,18 +1,21 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=vp_android_baseline_2022@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s

-module {
-  func.func @static_1d_sort() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
-    %1 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1000], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<1000xi32>> -> tensor<1000xi32>
-    %2 = iree_linalg_ext.sort dimension(0) outs(%1 : tensor<1000xi32>) {
-    ^bb0(%arg0: i32, %arg1: i32):
-      %3 = arith.cmpi slt, %arg0, %arg1 : i32
-      iree_linalg_ext.yield %3 : i1
-    } -> tensor<1000xi32>
-    flow.dispatch.tensor.store %2, %0, offsets = [0], sizes = [1000], strides = [1] : tensor<1000xi32> -> !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @static_1d_sort() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
+  %1 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1000], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<1000xi32>> -> tensor<1000xi32>
+  %2 = iree_linalg_ext.sort dimension(0) outs(%1 : tensor<1000xi32>) {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %3 = arith.cmpi slt, %arg0, %arg1 : i32
+    iree_linalg_ext.yield %3 : i1
+  } -> tensor<1000xi32>
+  flow.dispatch.tensor.store %2, %0, offsets = [0], sizes = [1000], strides = [1] : tensor<1000xi32> -> !flow.dispatch.tensor<readwrite:tensor<1000xi32>>
+  return
 }

 // Check that the workgroup count and size are (1, 1, 1) for serializing the computation.
@@ -25,25 +28,30 @@ module {
 // CHECK-SAME:   lowering_config = #[[CONFIG]]

 // -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-module {
-  func.func @static_3d_sort() {
-    %c64 = arith.constant 64 : index
-    %c128 = arith.constant 128 : index
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x32x128xi32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x32x128xi32>
-    linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : memref<64x32x128xi32>) outs(%1 : memref<64x32x128xi32>) {
-    ^bb0(%in: i32, %out: i32):
-      linalg.yield %in : i32
-    }
-    iree_linalg_ext.sort dimension(1) outs(%1 : memref<64x32x128xi32>) {
-    ^bb0(%arg0: i32, %arg1: i32):
-      %2 = arith.cmpi slt, %arg0, %arg1 : i32
-      iree_linalg_ext.yield %2 : i1
-    }
-    return
+func.func @static_3d_sort() {
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x32x128xi32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<64x32x128xi32>
+  linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : memref<64x32x128xi32>) outs(%1 : memref<64x32x128xi32>) {
+  ^bb0(%in: i32, %out: i32):
+    linalg.yield %in : i32
+  }
+  iree_linalg_ext.sort dimension(1) outs(%1 : memref<64x32x128xi32>) {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %2 = arith.cmpi slt, %arg0, %arg1 : i32
+    iree_linalg_ext.yield %2 : i1
  }
+  return
}
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -54,21 +62,26 @@ module {
 // CHECK-SAME:   lowering_config = #[[CONFIG]]

 // -----
-module {
-  func.func @static_1d_fft_stage2() {
-    %c0 = arith.constant 0 : index
-    %c2 = arith.constant 2 : index
-    %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
-    %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
-    %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
-    %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
-    flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-    return
-  }
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @static_1d_fft_stage2() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
+  %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  %1 =
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+  %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
+  %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
+  flow.dispatch.tensor.store %4#0, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  flow.dispatch.tensor.store %4#1, %1, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32> -> !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -79,22 +92,27 @@ module {
 // CHECK-SAME:   lowering_config = #[[CONFIG]]

 // -----
-module {
-  func.func @static_3d_fft_stage3() {
-    %c0 = arith.constant 0 : index
-    %c3 = arith.constant 3 : index
-    %c64 = arith.constant 64 : index
-    %c128 = arith.constant 128 : index
-    %c32 = arith.constant 32 : index
-    %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
-    %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
-    %0 = bufferization.to_memref %cst_0 : memref<4xf32>
-    %1 = bufferization.to_memref %cst : memref<4xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x128x32xf32>
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<64x128x32xf32>
-    iree_linalg_ext.fft ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
-    return
-  }
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @static_3d_fft_stage3() {
+  %c0 = arith.constant 0 : index
+  %c3 = arith.constant 3 : index
+  %c64 = arith.constant 64 : index
+  %c128 = arith.constant 128 : index
+  %c32 = arith.constant 32 : index
+  %cst = arith.constant dense<[1.000000e+00, 0.707106769, 6.12323426E-17, -0.707106769]> : tensor<4xf32>
+  %cst_0 = arith.constant dense<[-0.000000e+00, -0.707106769, -1.000000e+00, -0.707106769]> : tensor<4xf32>
+  %0 = bufferization.to_memref %cst_0 : memref<4xf32>
+  %1 = bufferization.to_memref %cst : memref<4xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x128x32xf32>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<64x128x32xf32>
+  iree_linalg_ext.fft ins(%c3, %1, %0 : index, memref<4xf32>, memref<4xf32>) outs(%2, %3 : memref<64x128x32xf32>, memref<64x128x32xf32>)
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -105,17 +123,22 @@ module {
 // CHECK-SAME:   lowering_config = #[[CONFIG]]

 // -----
-module {
-  func.func @winograd_input_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>> -> tensor<2x34x34x128xf16>
-    %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16>
-    %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
-    return
-  }
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_input_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x34x34x128xf16>> -> tensor<2x34x34x128xf16>
+  %3 = tensor.empty() : tensor<8x8x2x6x6x128xf16>
+  %4 = iree_linalg_ext.winograd.input_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<2x34x34x128xf16>) outs(%3 : tensor<8x8x2x6x6x128xf16>) -> tensor<8x8x2x6x6x128xf16>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : tensor<8x8x2x6x6x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<8x8x2x6x6x128xf16>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -126,17 +149,22 @@ module {
 // CHECK-SAME:   lowering_config = #[[CONFIG]]

 // -----
-module {
-  func.func @winograd_output_transform() {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>> -> tensor<8x8x2x6x6x128xf16>
-    %3 = tensor.empty() : tensor<2x36x36x128xf16>
-    %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16>
-    flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128], strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
-    return
-  }
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+func.func @winograd_output_transform() {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0, 0], sizes = [8, 8, 2, 6, 6, 128], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<8x8x2x6x6x128xf16>> -> tensor<8x8x2x6x6x128xf16>
+  %3 = tensor.empty() : tensor<2x36x36x128xf16>
+  %4 = iree_linalg_ext.winograd.output_transform output_tile_size(6) kernel_size(3) image_dimensions([1, 2]) ins(%2 : tensor<8x8x2x6x6x128xf16>) outs(%3 : tensor<2x36x36x128xf16>) -> tensor<2x36x36x128xf16>
+  flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2, 36, 36, 128],
strides = [1, 1, 1, 1] : tensor<2x36x36x128xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x36x36x128xf16>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
index e579c778db55e..e02c07d52c3c1 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_linalg_ops.mlir
@@ -1,5 +1,11 @@
 // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s

+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }>
 #map = affine_map<(d0, d1) -> (d0, d1)>
-module {
-  func.func @copy_as_generic() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.constant.load[0] : index
-    %1 = hal.interface.constant.load[1] : index
-    %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
-    %3 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%0, %1}
-    linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<?x?xi32>) outs(%3 : memref<?x?xi32>) {
-    ^bb0(%in: i32, %out: i32):
-      linalg.yield %in : i32
-    }
-    return
+func.func @copy_as_generic() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<?x?xi32>{%0, %1}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<?x?xi32>{%0, %1}
+  linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<?x?xi32>) outs(%3 : memref<?x?xi32>) {
+  ^bb0(%in: i32, %out: i32):
+    linalg.yield %in : i32
  }
+  return
}
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -30,6 +34,12 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @copy() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %c224 = arith.constant 224 : index
-    %c3 = arith.constant 3 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x224x224x3xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<1x224x224x3xf32>
-    linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : memref<1x224x224x3xf32>) outs(%1 : memref<1x224x224x3xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      linalg.yield %in : f32
-    }
-    return
+func.func @copy() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %c224 = arith.constant 224 : index
+  %c3 =
arith.constant 3 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1x224x224x3xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<1x224x224x3xf32>
+  linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0 : memref<1x224x224x3xf32>) outs(%1 : memref<1x224x224x3xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
  }
+  return
}
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -63,28 +71,32 @@ module {

 // Average pooling op with nice tilable input.

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }>
-module {
-  func.func @avg_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %c2 = arith.constant 2 : index
-    %c8 = arith.constant 8 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
-    %2 = tensor.empty() : tensor<12x12xf32>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 24, 24, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>> -> tensor<1x24x24x8xf32>
-    %4 = tensor.empty() : tensor<1x2x2x8xf32>
-    %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
-    %6 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>} ins(%3, %2 : tensor<1x24x24x8xf32>, tensor<12x12xf32>) outs(%5 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
-    flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 8], strides = [1, 1, 1, 1] : tensor<1x2x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
-    return
-  }
+func.func @avg_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %c2 = arith.constant 2 : index
+  %c8 = arith.constant 8 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
+  %2 = tensor.empty() : tensor<12x12xf32>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 24, 24, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x24x24x8xf32>> -> tensor<1x24x24x8xf32>
+  %4 = tensor.empty() : tensor<1x2x2x8xf32>
+  %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
+  %6 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<12> : vector<2xi64>} ins(%3, %2 : tensor<1x24x24x8xf32>, tensor<12x12xf32>) outs(%5 : tensor<1x2x2x8xf32>) -> tensor<1x2x2x8xf32>
+  flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 2, 2, 8], strides = [1, 1, 1, 1] : tensor<1x2x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x2x2x8xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -96,6 +108,12 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target
 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
-module {
-  func.func @avg_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %cst_0 = arith.constant 4.900000e+01 : f32
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
-    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 1280], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>> -> tensor<1x7x7x1280xf32>
-    %3 = tensor.empty() : tensor<7x7xf32>
-    %4 = tensor.empty() : tensor<1x1x1x1280xf32>
-    %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
-    %6 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%2, %3 : tensor<1x7x7x1280xf32>, tensor<7x7xf32>) outs(%5 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
-    %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x1x1x1280xf32>) outs(%4 : tensor<1x1x1x1280xf32>) {
-    ^bb0(%in: f32, %out: f32):
-      %8 = arith.divf %in, %cst_0 : f32
-      linalg.yield %8 : f32
-    } -> tensor<1x1x1x1280xf32>
-    flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 1280], strides = [1, 1, 1, 1] : tensor<1x1x1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
-    return
-  }
+func.func @avg_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant 4.900000e+01 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 7, 7, 1280], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x7x7x1280xf32>> -> tensor<1x7x7x1280xf32>
+  %3 = tensor.empty() : tensor<7x7xf32>
+  %4 = tensor.empty() : tensor<1x1x1x1280xf32>
+  %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
+  %6 = linalg.pooling_nhwc_sum {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%2, %3 : tensor<1x7x7x1280xf32>, tensor<7x7xf32>) outs(%5 : tensor<1x1x1x1280xf32>) -> tensor<1x1x1x1280xf32>
+  %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x1x1x1280xf32>) outs(%4 : tensor<1x1x1x1280xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %8 = arith.divf %in, %cst_0 : f32
+    linalg.yield %8 : f32
+  } -> tensor<1x1x1x1280xf32>
+  flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0], sizes = [1, 1, 1, 1280], strides = [1, 1, 1, 1] : tensor<1x1x1x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x1x1280xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -136,29 +152,33 @@ module {

 // Max pooling op with odd size-1 dimension sizes.
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }>
-module {
-  func.func @max_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %cst = arith.constant 0xFF800000 : f32
-    %c38 = arith.constant 38 : index
-    %c1 = arith.constant 1 : index
-    %c0 = arith.constant 0 : index
-    %c320 = arith.constant 320 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
-    %2 = tensor.empty() : tensor<2x1xf32>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 76, 1, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>> -> tensor<1x76x1x1xf32>
-    %4 = tensor.empty() : tensor<1x38x1x1xf32>
-    %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
-    %6 = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 1]> : vector<2xi64>} ins(%3, %2 : tensor<1x76x1x1xf32>, tensor<2x1xf32>) outs(%5 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
-    flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 38, 1, 1], strides = [1, 1, 1, 1] : tensor<1x38x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
-    return
-  }
+func.func @max_pool() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %cst = arith.constant 0xFF800000 : f32
+  %c38 = arith.constant 38 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c320 = arith.constant 320 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
+  %2 = tensor.empty() : tensor<2x1xf32>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 76, 1, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x76x1x1xf32>> -> tensor<1x76x1x1xf32>
+  %4 = tensor.empty() : tensor<1x38x1x1xf32>
+  %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
+  %6 = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<[2, 1]> : vector<2xi64>} ins(%3, %2 : tensor<1x76x1x1xf32>, tensor<2x1xf32>) outs(%5 : tensor<1x38x1x1xf32>) -> tensor<1x38x1x1xf32>
+  flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 38, 1, 1], strides = [1, 1, 1, 1] : tensor<1x38x1x1xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x38x1x1xf32>>
+  return
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -172,6 +192,13 @@ module {

 // Element wise op with mismatched input and output rank.
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d1)>
-module {
-  func.func @elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %c10 = arith.constant 10 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x10xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<10xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<10xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
-    %5 = tensor.empty() : tensor<10xf32>
-    %6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<1x10xf32>, tensor<10xf32>) outs(%5 : tensor<10xf32>) {
-    ^bb0(%in: f32, %in_0: f32, %out: f32):
-      %7 = arith.addf %in, %in_0 : f32
-      linalg.yield %7 : f32
-    } -> tensor<10xf32>
-    flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<writeonly:tensor<10xf32>>
-    return
-  }
+func.func @elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x10xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<10xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<10xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1x10xf32>> -> tensor<1x10xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [10], strides = [1] : !flow.dispatch.tensor<readonly:tensor<10xf32>> -> tensor<10xf32>
+  %5 = tensor.empty() : tensor<10xf32>
+  %6 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%3, %4 : tensor<1x10xf32>, tensor<10xf32>) outs(%5 : tensor<10xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %7 = arith.addf %in, %in_0 : f32
+    linalg.yield %7 : f32
+  } -> tensor<10xf32>
+  flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [10], strides = [1] : tensor<10xf32> -> !flow.dispatch.tensor<writeonly:tensor<10xf32>>
+  return
 }
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -209,6 +234,12 @@ module {

 // Fused depthwise convolution and element wise ops: don't vectorize with partially active subgroups.
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }> #map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> -module { - func.func @dwconv_elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %cst = arith.constant dense_resource<__elided__> : tensor<3x3x1x4xf32> - %cst_0 = arith.constant 1.001000e+00 : f32 - %cst_1 = arith.constant 0.000000e+00 : f32 - %c18 = arith.constant 18 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c4576 = arith.constant 4576 : index - %c6272 = arith.constant 6272 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = tensor.empty() : tensor<1x19x18x1x4xf32> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 21, 20, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x21x20x1xf32> - %4 = tensor.empty() : tensor<1x19x18x1x4xf32> - %5 = linalg.fill ins(%cst_1 : f32) outs(%4 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32> - %6 = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %cst : tensor<1x21x20x1xf32>, tensor<3x3x1x4xf32>) outs(%5 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32> - %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x19x18x1x4xf32>) outs(%2 : tensor<1x19x18x1x4xf32>) { - ^bb0(%in: f32, %out: f32): - %8 = math.sqrt %cst_0 : f32 - %9 = arith.addf %in, %cst_1 : f32 - linalg.yield %9 : f32 - } -> tensor<1x19x18x1x4xf32> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 19, 18, 1, 4], strides = [1, 1, 1, 1, 1] : tensor<1x19x18x1x4xf32> -> !flow.dispatch.tensor> - return - } +func.func @dwconv_elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %cst = arith.constant dense_resource<__elided__> : tensor<3x3x1x4xf32> + %cst_0 = arith.constant 1.001000e+00 : f32 + %cst_1 = arith.constant 0.000000e+00 : f32 + %c18 = arith.constant 18 : index + %c1 = arith.constant 1 : index + %c4 = arith.constant 4 : index + %c4576 = arith.constant 4576 : index + %c6272 = arith.constant 6272 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = tensor.empty() : tensor<1x19x18x1x4xf32> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 21, 20, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x21x20x1xf32> + %4 = tensor.empty() : tensor<1x19x18x1x4xf32> + %5 = linalg.fill ins(%cst_1 : f32) outs(%4 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32> + %6 = linalg.depthwise_conv_2d_nhwc_hwcm {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%3, %cst : tensor<1x21x20x1xf32>, tensor<3x3x1x4xf32>) outs(%5 : tensor<1x19x18x1x4xf32>) -> tensor<1x19x18x1x4xf32> + %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x19x18x1x4xf32>) outs(%2 : tensor<1x19x18x1x4xf32>) { + ^bb0(%in: 
f32, %out: f32): + %8 = math.sqrt %cst_0 : f32 + %9 = arith.addf %in, %cst_1 : f32 + linalg.yield %9 : f32 + } -> tensor<1x19x18x1x4xf32> + flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0, 0, 0], sizes = [1, 19, 18, 1, 4], strides = [1, 1, 1, 1, 1] : tensor<1x19x18x1x4xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -253,6 +282,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1, d2) -> (d2, d0, d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @outermost_reduction() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2048, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x2048x512xf32> - %3 = tensor.empty() : tensor<2048x512xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> - %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<4x2048x512xf32>) outs(%4 : tensor<2048x512xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<2048x512xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @outermost_reduction() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2048, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x2048x512xf32> + %3 = tensor.empty() : tensor<2048x512xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32> + %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<4x2048x512xf32>) outs(%4 : tensor<2048x512xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<2048x512xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -287,9 +320,14 @@ module { // CHECK: linalg.generic // CHECK-SAME: lowering_config = #[[$CONFIG]] - // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d0, d1)> 
#map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @innermost_reduction() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %cst = arith.constant -0.000000e+00 : f32 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 394752 : index, 984064 : index]} : i32 to index - %4 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [0 : index, 196608 : index, 197120 : index]} : i32 to index - %5 = arith.index_cast %2 {stream.alignment = 512 : index, stream.values = [512 : index, 197120 : index, 197632 : index]} : i32 to index - %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) : !flow.dispatch.tensor> - %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%4) : !flow.dispatch.tensor> - %8 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor> - %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x384xf32> - %10 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor> -> tensor<128xf32> - %11 = tensor.empty() : tensor<128xf32> - %12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<128xf32>) -> tensor<128xf32> - %13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%9, %10 : tensor<128x384xf32>, tensor<128xf32>) outs(%12 : tensor<128xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %14 = arith.subf %in, %in_0 : f32 - %15 = arith.mulf %14, %14 : f32 - %16 = arith.addf %15, %out : f32 - linalg.yield %16 : f32 - } -> tensor<128xf32> - flow.dispatch.tensor.store %13, %8, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor> - return - } +func.func @innermost_reduction() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %cst = arith.constant -0.000000e+00 : f32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = arith.index_cast %0 {stream.alignment = 512 : index, stream.values = [0 : index, 394752 : index, 984064 : index]} : i32 to index + %4 = arith.index_cast %1 {stream.alignment = 512 : index, stream.values = [0 : index, 196608 : index, 197120 : index]} : i32 to index + %5 = arith.index_cast %2 {stream.alignment = 512 : index, stream.values = [512 : index, 197120 : index, 197632 : index]} : i32 to index + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%3) : !flow.dispatch.tensor> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%4) : !flow.dispatch.tensor> + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor> + %9 = flow.dispatch.tensor.load %6, offsets = [0, 0], sizes = [128, 384], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x384xf32> + %10 = flow.dispatch.tensor.load %7, offsets = [0], sizes = [128], strides = [1] : !flow.dispatch.tensor> -> tensor<128xf32> + %11 = tensor.empty() : tensor<128xf32> + %12 = linalg.fill ins(%cst : 
f32) outs(%11 : tensor<128xf32>) -> tensor<128xf32> + %13 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "reduction"]} ins(%9, %10 : tensor<128x384xf32>, tensor<128xf32>) outs(%12 : tensor<128xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %14 = arith.subf %in, %in_0 : f32 + %15 = arith.mulf %14, %14 : f32 + %16 = arith.addf %15, %out : f32 + linalg.yield %16 : f32 + } -> tensor<128xf32> + flow.dispatch.tensor.store %13, %8, offsets = [0], sizes = [128], strides = [1] : tensor<128xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -335,6 +371,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1, d2)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module { - func.func @four_dim_elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 8, 256, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x8x256x4xf32> - %3 = tensor.empty() : tensor<128x256x4x8xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<128x8x256x4xf32>) outs(%3 : tensor<128x256x4x8xf32>) { - ^bb0(%in: f32, %out: f32): - linalg.yield %in : f32 - } -> tensor<128x256x4x8xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [128, 256, 4, 8], strides = [1, 1, 1, 1] : tensor<128x256x4x8xf32> -> !flow.dispatch.tensor> - return - } +func.func @four_dim_elementwise() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [128, 8, 256, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x8x256x4xf32> + %3 = tensor.empty() : tensor<128x256x4x8xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<128x8x256x4xf32>) outs(%3 : tensor<128x256x4x8xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<128x256x4x8xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [128, 256, 4, 8], strides = [1, 1, 1, 1] : tensor<128x256x4x8xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -368,6 +408,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> 
(d0)> -module { - func.func @odd_reduction_dimension_size_501() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0xFF800000 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x501xf32> - %3 = tensor.empty() : tensor<512x501xf32> - %4 = tensor.empty() : tensor<512xf32> - %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32> - %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x501xf32>) outs(%5 : tensor<512xf32>) { - ^bb0(%in: f32, %out: f32): - %8 = arith.maximumf %out, %in : f32 - linalg.yield %8 : f32 - } -> tensor<512xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<512x501xf32>, tensor<512xf32>) outs(%3 : tensor<512x501xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.subf %in, %in_0 : f32 - %9 = math.exp %8 : f32 - linalg.yield %9 : f32 - } -> tensor<512x501xf32> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : tensor<512x501xf32> -> !flow.dispatch.tensor> - return - } +func.func @odd_reduction_dimension_size_501() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0xFF800000 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x501xf32> + %3 = tensor.empty() : tensor<512x501xf32> + %4 = tensor.empty() : tensor<512xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32> + %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x501xf32>) outs(%5 : tensor<512xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = arith.maximumf %out, %in : f32 + linalg.yield %8 : f32 + } -> tensor<512xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<512x501xf32>, tensor<512xf32>) outs(%3 : tensor<512x501xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %8 = arith.subf %in, %in_0 : f32 + %9 = math.exp %8 : f32 + linalg.yield %9 : f32 + } -> tensor<512x501xf32> + flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 501], strides = [1, 1] : tensor<512x501xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -411,6 +455,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @odd_reduction_dimension_size_2809() attributes 
{hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0xFF800000 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x2809xf32> - %3 = tensor.empty() : tensor<512x2809xf32> - %4 = tensor.empty() : tensor<512xf32> - %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32> - %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x2809xf32>) outs(%5 : tensor<512xf32>) { - ^bb0(%in: f32, %out: f32): - %8 = arith.maximumf %out, %in : f32 - linalg.yield %8 : f32 - } -> tensor<512xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<512x2809xf32>, tensor<512xf32>) outs(%3 : tensor<512x2809xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.subf %in, %in_0 : f32 - %9 = math.exp %8 : f32 - linalg.yield %9 : f32 - } -> tensor<512x2809xf32> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : tensor<512x2809xf32> -> !flow.dispatch.tensor> - return - } +func.func @odd_reduction_dimension_size_2809() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0xFF800000 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x2809xf32> + %3 = tensor.empty() : tensor<512x2809xf32> + %4 = tensor.empty() : tensor<512xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<512xf32>) -> tensor<512xf32> + %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x2809xf32>) outs(%5 : tensor<512xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = arith.maximumf %out, %in : f32 + linalg.yield %8 : f32 + } -> tensor<512xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %6 : tensor<512x2809xf32>, tensor<512xf32>) outs(%3 : tensor<512x2809xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %8 = arith.subf %in, %in_0 : f32 + %9 = math.exp %8 : f32 + linalg.yield %9 : f32 + } -> tensor<512x2809xf32> + flow.dispatch.tensor.store %7, %1, offsets = [0, 0], sizes = [512, 2809], strides = [1, 1] : tensor<512x2809xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -454,6 +502,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1, d2, d3) -> ()> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module { - func.func @broadcast() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = 
arith.constant 0 : index - %cst = arith.constant 1.000000e-10 : f32 - %cst_0 = arith.constant -1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor - %3 = tensor.empty() : tensor<2048x1x1x1xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor) outs(%3 : tensor<2048x1x1x1xf32>) { - ^bb0(%in: f32, %out: f32): - %5 = arith.maximumf %in, %cst : f32 - %6 = arith.divf %cst_0, %5 : f32 - linalg.yield %6 : f32 - } -> tensor<2048x1x1x1xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2048, 1, 1, 1], strides = [1, 1, 1, 1] : tensor<2048x1x1x1xf32> -> !flow.dispatch.tensor> - return - } +func.func @broadcast() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 1.000000e-10 : f32 + %cst_0 = arith.constant -1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor + %3 = tensor.empty() : tensor<2048x1x1x1xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor) outs(%3 : tensor<2048x1x1x1xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = arith.maximumf %in, %cst : f32 + %6 = arith.divf %cst_0, %5 : f32 + linalg.yield %6 : f32 + } -> tensor<2048x1x1x1xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0, 0], sizes = [2048, 1, 1, 1], strides = [1, 1, 1, 1] : tensor<2048x1x1x1xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir index f4a803aa215b4..9ff2c67156bf8 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matmul.mlir @@ -2,30 +2,35 @@ // Odd K that forbids vectorization. 
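// (Gloss: the batch matmul below contracts over K = 3; no vector width of 2
// or more divides 3 evenly, so no vector tile can cover the reduction
// dimension and a non-vectorized configuration is expected.)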
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }>
-module {
-  func.func @batch_matmul_1x3x32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %c32 = arith.constant 32 : index
-    %c3 = arith.constant 3 : index
-    %c1 = arith.constant 1 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 3, 3], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>> -> tensor<1x3x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>> -> tensor<1x3x32xf32>
-    %5 = tensor.empty() : tensor<1x3x32xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
-    %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<1x3x3xf32>, tensor<1x3x32xf32>) outs(%6 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1] : tensor<1x3x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
-    return
-  }
+func.func @batch_matmul_1x3x32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %c3 = arith.constant 3 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 3, 3], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x3xf32>> -> tensor<1x3x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x32xf32>> -> tensor<1x3x32xf32>
+  %5 = tensor.empty() : tensor<1x3x32xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
+  %7 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<1x3x3xf32>, tensor<1x3x32xf32>) outs(%6 : tensor<1x3x32xf32>) -> tensor<1x3x32xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [1, 3, 32], strides = [1, 1, 1] : tensor<1x3x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x3x32xf32>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -39,29 +44,34 @@ module {
 // 8-bit integers can be vectorized.
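 // (Gloss: with i8 elements a 32-bit word packs 4 values, and K = 32 and
 // N = 16 below are both multiples of 4, so 4-wide vector accesses line up;
 // the i32 accumulator is wide enough for the 32 products being summed.)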
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }>
-module {
-  func.func @matmul_64x16xi8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %c16 = arith.constant 16 : index
-    %c64 = arith.constant 64 : index
-    %c0_i32 = arith.constant 0 : i32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xi8>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x16xi8>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xi8>> -> tensor<64x32xi8>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x16xi8>> -> tensor<32x16xi8>
-    %5 = tensor.empty() : tensor<64x16xi32>
-    %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<64x16xi32>) -> tensor<64x16xi32>
-    %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<64x32xi8>, tensor<32x16xi8>) outs(%6 : tensor<64x16xi32>) -> tensor<64x16xi32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1] : tensor<64x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
-    return
-  }
+func.func @matmul_64x16xi8() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c0_i32 = arith.constant 0 : i32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<64x32xi8>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<32x16xi8>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xi8>> -> tensor<64x32xi8>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x16xi8>> -> tensor<32x16xi8>
+  %5 = tensor.empty() : tensor<64x16xi32>
+  %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<64x16xi32>) -> tensor<64x16xi32>
+  %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<64x32xi8>, tensor<32x16xi8>) outs(%6 : tensor<64x16xi32>) -> tensor<64x16xi32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1] : tensor<64x16xi32> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi32>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -75,29 +85,34 @@ module {
 // Vectorize non-32 bit types.
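 // (Gloss: same 64x32 * 32x16 shape as above but with i64 elements; the point
 // is that vector configurations should still be chosen for element types
 // wider than 32 bits, not only for the common 32-bit case.)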
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }>
-module {
-  func.func @matmul_64x16xi64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %c16 = arith.constant 16 : index
-    %c64 = arith.constant 64 : index
-    %c0_i32 = arith.constant 0 : i32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<64x32xi64>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<32x16xi64>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xi64>> -> tensor<64x32xi64>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x16xi64>> -> tensor<32x16xi64>
-    %5 = tensor.empty() : tensor<64x16xi64>
-    %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<64x16xi64>) -> tensor<64x16xi64>
-    %7 = linalg.matmul ins(%3, %4 : tensor<64x32xi64>, tensor<32x16xi64>) outs(%6 : tensor<64x16xi64>) -> tensor<64x16xi64>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1] : tensor<64x16xi64> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
-    return
-  }
+func.func @matmul_64x16xi64() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %c64 = arith.constant 64 : index
+  %c0_i32 = arith.constant 0 : i32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<64x32xi64>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<32x16xi64>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<64x32xi64>> -> tensor<64x32xi64>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32x16xi64>> -> tensor<32x16xi64>
+  %5 = tensor.empty() : tensor<64x16xi64>
+  %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<64x16xi64>) -> tensor<64x16xi64>
+  %7 = linalg.matmul ins(%3, %4 : tensor<64x32xi64>, tensor<32x16xi64>) outs(%6 : tensor<64x16xi64>) -> tensor<64x16xi64>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 16], strides = [1, 1] : tensor<64x16xi64> -> !flow.dispatch.tensor<writeonly:tensor<64x16xi64>>
+  return
 }
 // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config
@@ -111,6 +126,13 @@ module {
 // Odd N that forbids vectorization.
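 // (Gloss: the matmul below has N = 273 = 4 * 68 + 1; an odd extent cannot be
 // divided evenly by any vector width greater than 1, so a scalar
 // configuration is expected along that dimension.)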
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @matmul_400x273() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %c11775744 = arith.constant 11775744 : index - %cst = arith.constant 0.000000e+00 : f32 - %c400 = arith.constant 400 : index - %c273 = arith.constant 273 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c11775744) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [273], strides = [1] : !flow.dispatch.tensor> -> tensor<273xf32> - %5 = tensor.empty() : tensor<400x273xf32> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [400, 576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<400x576xf32> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [576, 273], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x273xf32> - %8 = tensor.empty() : tensor<400x273xf32> - %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<400x273xf32>) -> tensor<400x273xf32> - %10 = linalg.matmul ins(%6, %7 : tensor<400x576xf32>, tensor<576x273xf32>) outs(%9 : tensor<400x273xf32>) -> tensor<400x273xf32> - %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %10 : tensor<273xf32>, tensor<400x273xf32>) outs(%5 : tensor<400x273xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %12 = arith.addf %in, %in_0 : f32 - linalg.yield %12 : f32 - } -> tensor<400x273xf32> - flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [400, 273], strides = [1, 1] : tensor<400x273xf32> -> !flow.dispatch.tensor> - return - } +func.func @matmul_400x273() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %c11775744 = arith.constant 11775744 : index + %cst = arith.constant 0.000000e+00 : f32 + %c400 = arith.constant 400 : index + %c273 = arith.constant 273 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%c11775744) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [273], strides = [1] : !flow.dispatch.tensor> -> tensor<273xf32> + %5 = tensor.empty() : tensor<400x273xf32> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [400, 576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<400x576xf32> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [576, 273], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x273xf32> + %8 = tensor.empty() : tensor<400x273xf32> + %9 = linalg.fill ins(%cst : f32) outs(%8 
: tensor<400x273xf32>) -> tensor<400x273xf32> + %10 = linalg.matmul ins(%6, %7 : tensor<400x576xf32>, tensor<576x273xf32>) outs(%9 : tensor<400x273xf32>) -> tensor<400x273xf32> + %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %10 : tensor<273xf32>, tensor<400x273xf32>) outs(%5 : tensor<400x273xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %12 = arith.addf %in, %in_0 : f32 + linalg.yield %12 : f32 + } -> tensor<400x273xf32> + flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [400, 273], strides = [1, 1] : tensor<400x273xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -158,6 +178,13 @@ module { // Odd M and non-4-multiplier N +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @matmul_25x546() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %c15842560 = arith.constant 15842560 : index - %cst = arith.constant 0.000000e+00 : f32 - %c25 = arith.constant 25 : index - %c546 = arith.constant 546 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c15842560) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [546], strides = [1] : !flow.dispatch.tensor> -> tensor<546xf32> - %5 = tensor.empty() : tensor<25x546xf32> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [25, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<25x512xf32> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 546], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x546xf32> - %8 = tensor.empty() : tensor<25x546xf32> - %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<25x546xf32>) -> tensor<25x546xf32> - %10 = linalg.matmul ins(%6, %7 : tensor<25x512xf32>, tensor<512x546xf32>) outs(%9 : tensor<25x546xf32>) -> tensor<25x546xf32> - %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %10 : tensor<546xf32>, tensor<25x546xf32>) outs(%5 : tensor<25x546xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %12 = arith.addf %in, %in_0 : f32 - linalg.yield %12 : f32 - } -> tensor<25x546xf32> - flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [25, 546], strides = [1, 1] : tensor<25x546xf32> -> !flow.dispatch.tensor> - return - } +func.func @matmul_25x546() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %c15842560 = arith.constant 15842560 : index + %cst = arith.constant 0.000000e+00 : f32 + %c25 = arith.constant 25 : index + %c546 = arith.constant 546 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%c15842560) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [546], strides = [1] : !flow.dispatch.tensor> -> tensor<546xf32> + %5 = tensor.empty() : tensor<25x546xf32> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [25, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<25x512xf32> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [512, 546], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x546xf32> + %8 = tensor.empty() : tensor<25x546xf32> + %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<25x546xf32>) -> tensor<25x546xf32> + %10 = linalg.matmul ins(%6, %7 : tensor<25x512xf32>, tensor<512x546xf32>) outs(%9 : tensor<25x546xf32>) -> tensor<25x546xf32> + %11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%4, %10 : tensor<546xf32>, tensor<25x546xf32>) outs(%5 : tensor<25x546xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %12 = arith.addf %in, %in_0 : f32 + linalg.yield %12 : f32 + } -> tensor<25x546xf32> + flow.dispatch.tensor.store %11, %3, offsets = [0, 0], sizes = [25, 546], strides = [1, 1] : tensor<25x546xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -205,6 +230,15 @@ module { // Matmul with consumer pointwise ops +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @matmul_pointwise_256x1024() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %c256 = arith.constant 256 : index - %c1024 = arith.constant 1024 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> - %7 = tensor.empty() : tensor<256x1024xf16> - %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x128xf16> - %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x1024xf16> - %10 = tensor.empty() : tensor<256x1024xf16> - %11 = linalg.fill ins(%cst : f16) outs(%10 : 
tensor<256x1024xf16>) -> tensor<256x1024xf16> - %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.divf %in, %in_0 : f16 - %15 = arith.subf %14, %in_1 : f16 - linalg.yield %15 : f16 - } -> tensor<256x1024xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> - return - } +func.func @matmul_pointwise_256x1024() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %c256 = arith.constant 256 : index + %c1024 = arith.constant 1024 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> + %7 = tensor.empty() : tensor<256x1024xf16> + %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x128xf16> + %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x1024xf16> + %10 = tensor.empty() : tensor<256x1024xf16> + %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) { + ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.divf %in, %in_0 : f16 + %15 = arith.subf %14, %in_1 : f16 + linalg.yield %15 : f16 + } -> tensor<256x1024xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir index 2a20a2c958a87..8deaa1191986d 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_misc.mlir @@ -1,40 +1,46 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=vp_android_baseline_2022@vulkan 
--pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d1)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @complex_view_as_real() { - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi32> - %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x32x50x2xf32> - %6 = tensor.empty() : tensor<32x50x2xf32> - %extracted = tensor.extract %4[%c0] : tensor<1xi32> - %7 = arith.extsi %extracted : i32 to i64 - %8 = arith.index_cast %7 : i64 to index - %9 = flow.dispatch.tensor.load %1, offsets = [%8, 0], sizes = [1, 50], strides = [1, 1] : !flow.dispatch.tensor>> -> tensor<50xcomplex> - %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9 : tensor<50xcomplex>) outs(%6 : tensor<32x50x2xf32>) { - ^bb0(%in: complex, %out: f32): - %11 = linalg.index 0 : index - %12 = linalg.index 1 : index - %extracted_0 = tensor.extract %5[%c0, %c0, %11, %12, %c0] : tensor<1x1x32x50x2xf32> - %extracted_1 = tensor.extract %5[%c0, %c0, %11, %12, %c1] : tensor<1x1x32x50x2xf32> - %13 = complex.create %extracted_0, %extracted_1 : complex - %14 = complex.mul %13, %in : complex - %15 = complex.re %14 : complex - %16 = complex.im %14 : complex - %17 = linalg.index 2 : index - %18 = arith.cmpi eq, %17, %c0 : index - %19 = arith.select %18, %15, %16 : f32 - linalg.yield %19 : f32 - } -> tensor<32x50x2xf32> - flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [32, 50, 2], strides = [1, 1, 1] : tensor<32x50x2xf32> -> !flow.dispatch.tensor> - return - } +func.func @complex_view_as_real() { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi32> + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> 
tensor<1x1x32x50x2xf32> + %6 = tensor.empty() : tensor<32x50x2xf32> + %extracted = tensor.extract %4[%c0] : tensor<1xi32> + %7 = arith.extsi %extracted : i32 to i64 + %8 = arith.index_cast %7 : i64 to index + %9 = flow.dispatch.tensor.load %1, offsets = [%8, 0], sizes = [1, 50], strides = [1, 1] : !flow.dispatch.tensor>> -> tensor<50xcomplex> + %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9 : tensor<50xcomplex>) outs(%6 : tensor<32x50x2xf32>) { + ^bb0(%in: complex, %out: f32): + %11 = linalg.index 0 : index + %12 = linalg.index 1 : index + %extracted_0 = tensor.extract %5[%c0, %c0, %11, %12, %c0] : tensor<1x1x32x50x2xf32> + %extracted_1 = tensor.extract %5[%c0, %c0, %11, %12, %c1] : tensor<1x1x32x50x2xf32> + %13 = complex.create %extracted_0, %extracted_1 : complex + %14 = complex.mul %13, %in : complex + %15 = complex.re %14 : complex + %16 = complex.im %14 : complex + %17 = linalg.index 2 : index + %18 = arith.cmpi eq, %17, %c0 : index + %19 = arith.select %18, %15, %16 : f32 + linalg.yield %19 : f32 + } -> tensor<32x50x2xf32> + flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [32, 50, 2], strides = [1, 1, 1] : tensor<32x50x2xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir index 960d0ac55a142..09c4c36f12457 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_reduction.mlir @@ -1,5 +1,11 @@ // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @subgroup_reduce_f32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x512xf32> - %3 = tensor.empty() : tensor<2xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2xf32>) -> tensor<2xf32> - %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<2x512xf32>) outs(%4 : tensor<2xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %out, %in : f32 - linalg.yield %6 : f32 - } -> tensor<2xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !flow.dispatch.tensor> - return - } +func.func @subgroup_reduce_f32() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x512xf32> + %3 = tensor.empty() : tensor<2xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2xf32>) -> tensor<2xf32> + %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<2x512xf32>) outs(%4 : tensor<2xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %out, %in : f32 + linalg.yield %6 : f32 + } -> tensor<2xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [2], strides = [1] : tensor<2xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -36,6 +40,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @subgroup_reduce_f16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x4096x4096xf16> - %3 = tensor.empty() : tensor<16x4096x4096xf16> - %4 = tensor.empty() : tensor<16x4096xf16> - %5 = linalg.fill ins(%cst : f16) outs(%4 : tensor<16x4096xf16>) -> tensor<16x4096xf16> - %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<16x4096x4096xf16>) outs(%5 : tensor<16x4096xf16>) { - ^bb0(%in: f16, %out: f16): - %8 = arith.addf %in, %out : f16 - linalg.yield %8 : f16 - } -> tensor<16x4096xf16> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<16x4096x4096xf16>, tensor<16x4096xf16>) outs(%3 : tensor<16x4096x4096xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.divf %in, %in_0 : f16 - linalg.yield %8 : f16 - } -> tensor<16x4096x4096xf16> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : tensor<16x4096x4096xf16> -> !flow.dispatch.tensor> - return - } +func.func @subgroup_reduce_f16() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x4096x4096xf16> + %3 = tensor.empty() : tensor<16x4096x4096xf16> + %4 = tensor.empty() : tensor<16x4096xf16> + %5 = linalg.fill ins(%cst : f16) outs(%4 
: tensor<16x4096xf16>) -> tensor<16x4096xf16> + %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<16x4096x4096xf16>) outs(%5 : tensor<16x4096xf16>) { + ^bb0(%in: f16, %out: f16): + %8 = arith.addf %in, %out : f16 + linalg.yield %8 : f16 + } -> tensor<16x4096xf16> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<16x4096x4096xf16>, tensor<16x4096xf16>) outs(%3 : tensor<16x4096x4096xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.divf %in, %in_0 : f16 + linalg.yield %8 : f16 + } -> tensor<16x4096x4096xf16> + flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [16, 4096, 4096], strides = [1, 1, 1] : tensor<16x4096x4096xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -78,6 +86,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @subgroup_reduce_dynamic() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c32_i64 = arith.constant 32 : i64 - %cst = arith.constant 0.000000e+00 : f32 - %cst_0 = arith.constant 2.000000e+00 : f32 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.extui %0 : i32 to i64 - %3 = arith.extui %1 : i32 to i64 - %4 = arith.shli %3, %c32_i64 : i64 - %5 = arith.ori %2, %4 : i64 - %6 = arith.index_castui %5 : i64 to index - %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %8 = flow.dispatch.workload.ordinal %6, 0 : index - %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%8} - %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [8, %8], strides = [1, 1] : !flow.dispatch.tensor>{%8} -> tensor<8x?xf32> - %11 = tensor.empty() : tensor<8xf32> - %12 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : tensor<8xf32>) -> tensor<8xf32> - %13 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<8x?xf32>) outs(%12 : tensor<8xf32>) attrs = {lowering_config = #config} { - ^bb0(%in: f32, %out: f32): - %14 = math.powf %in, %cst_0 : f32 - %15 = arith.addf %14, %out : f32 - linalg.yield %15 : f32 - } -> tensor<8xf32> - flow.dispatch.tensor.store %13, %7, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> - return - } +func.func @subgroup_reduce_dynamic() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c32_i64 = arith.constant 32 : i64 + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 2.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.extui %0 : i32 to i64 + %3 = arith.extui %1 : i32 to i64 + %4 = arith.shli %3, %c32_i64 : i64 + %5 = arith.ori %2, %4 : i64 + %6 = arith.index_castui %5 : i64 to index + %7 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %8 = flow.dispatch.workload.ordinal %6, 0 : index + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%8} + %10 = flow.dispatch.tensor.load %9, offsets = [0, 0], sizes = [8, %8], strides = [1, 1] : !flow.dispatch.tensor>{%8} -> tensor<8x?xf32> + %11 = tensor.empty() : tensor<8xf32> + %12 = linalg.fill {lowering_config = #config} ins(%cst : f32) outs(%11 : tensor<8xf32>) -> tensor<8xf32> + %13 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%10 : tensor<8x?xf32>) outs(%12 : tensor<8xf32>) attrs = {lowering_config = #config} { + ^bb0(%in: f32, %out: f32): + %14 = math.powf %in, %cst_0 : f32 + %15 = arith.addf %14, %out : f32 + linalg.yield %15 : f32 + } -> tensor<8xf32> + flow.dispatch.tensor.store %13, %7, offsets = [0], sizes = [8], strides = [1] : tensor<8xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir index 93988205e8f04..fefcfe06b45ba 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_sub_byte_types.mlir @@ -1,29 +1,35 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=vp_android_baseline_2022@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @i4_dequant() { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<131072x128xi4> - %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor> -> tensor<131072xf32> - %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor> -> tensor<131072xf32> - %7 = tensor.empty() : tensor<131072x128xf32> - %8 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %5, %6 : tensor<131072x128xi4>, tensor<131072xf32>, tensor<131072xf32>) outs(%7 : tensor<131072x128xf32>) { - ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): - %9 = arith.extui %in : i4 to i32 - %10 = arith.uitofp %9 : i32 to f32 - %11 = arith.subf %10, %in_1 : f32 - %12 = arith.mulf %11, %in_0 : f32 - linalg.yield %12 : f32 - } -> 
tensor<131072x128xf32> - flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : tensor<131072x128xf32> -> !flow.dispatch.tensor> - return - } +func.func @i4_dequant() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<131072x128xi4> + %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor> -> tensor<131072xf32> + %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor> -> tensor<131072xf32> + %7 = tensor.empty() : tensor<131072x128xf32> + %8 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %5, %6 : tensor<131072x128xi4>, tensor<131072xf32>, tensor<131072xf32>) outs(%7 : tensor<131072x128xf32>) { + ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): + %9 = arith.extui %in : i4 to i32 + %10 = arith.uitofp %9 : i32 to f32 + %11 = arith.subf %10, %in_1 : f32 + %12 = arith.mulf %11, %in_0 : f32 + linalg.yield %12 : f32 + } -> tensor<131072x128xf32> + flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : tensor<131072x128xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir index d56d0c3270c3a..8ae533f2a2b11 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_conv.mlir @@ -2,23 +2,28 @@ // Conv - large OC - distribute to only one workgroup dimension. 
-module {
-  func.func @conv_112x112x512() {
-    %c0 = arith.constant 0 : index
-    %c512 = arith.constant 512 : index
-    %c112 = arith.constant 112 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
-    %5 = tensor.empty() : tensor<1x112x112x512xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%6 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1] : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @conv_112x112x512() {
+  %c0 = arith.constant 0 : index
+  %c512 = arith.constant 512 : index
+  %c112 = arith.constant 112 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 512], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x512xf32>> -> tensor<3x3x3x512xf32>
+  %5 = tensor.empty() : tensor<1x112x112x512xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x512xf32>) outs(%6 : tensor<1x112x112x512xf32>) -> tensor<1x112x112x512xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 512], strides = [1, 1, 1, 1] : tensor<1x112x112x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x512xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -32,23 +37,28 @@ module {
 
 // Conv - medium OC/OW/OH - distribute to two workgroup dimensions.
 
-module {
-  func.func @conv_112x112x32() {
-    %c0 = arith.constant 0 : index
-    %c32 = arith.constant 32 : index
-    %c112 = arith.constant 112 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
-    %5 = tensor.empty() : tensor<1x112x112x32xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @conv_112x112x32() {
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %c112 = arith.constant 112 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x225x225x3xf32>> -> tensor<1x225x225x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x32xf32>> -> tensor<3x3x3x32xf32>
+  %5 = tensor.empty() : tensor<1x112x112x32xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x225x225x3xf32>, tensor<3x3x3x32xf32>) outs(%6 : tensor<1x112x112x32xf32>) -> tensor<1x112x112x32xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x112x112x32xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -62,22 +72,27 @@ module {
 
 // Conv - small OC/OW/OH - distribute to all three workgroup dimensions.
 
-module {
-  func.func @conv_16x16x16() {
-    %c0 = arith.constant 0 : index
-    %c16 = arith.constant 16 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
-    %5 = tensor.empty() : tensor<1x16x16x16xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
-    %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1] : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @conv_16x16x16() {
+  %c0 = arith.constant 0 : index
+  %c16 = arith.constant 16 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 33, 33, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x33x33x3xf32>> -> tensor<1x33x33x3xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x3x16xf32>> -> tensor<3x3x3x16xf32>
+  %5 = tensor.empty() : tensor<1x16x16x16xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+  %7 = linalg.conv_2d_nhwc_hwcf {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x33x33x3xf32>, tensor<3x3x3x16xf32>) outs(%6 : tensor<1x16x16x16xf32>) -> tensor<1x16x16x16xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 16], strides = [1, 1, 1, 1] : tensor<1x16x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x16x16xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -91,23 +106,28 @@ module {
 
 // Depthwise conv - small OC/OW/OH - distribute to all three workgroup dimensions.
 
-module {
-  func.func @dwconv_28x28x144() {
-    %c0 = arith.constant 0 : index
-    %c144 = arith.constant 144 : index
-    %c28 = arith.constant 28 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [0, 57, 57, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
-    %5 = tensor.empty() : tensor<1x28x28x144xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%6 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1] : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @dwconv_28x28x144() {
+  %c0 = arith.constant 0 : index
+  %c144 = arith.constant 144 : index
+  %c28 = arith.constant 28 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [0, 57, 57, 144], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x57x57x144xf32>> -> tensor<1x57x57x144xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 144], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x144xf32>> -> tensor<3x3x144xf32>
+  %5 = tensor.empty() : tensor<1x28x28x144xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x57x57x144xf32>, tensor<3x3x144xf32>) outs(%6 : tensor<1x28x28x144xf32>) -> tensor<1x28x28x144xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 28, 28, 144], strides = [1, 1, 1, 1] : tensor<1x28x28x144xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x28x28x144xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -121,24 +141,29 @@ module {
 
 // Depthwise conv - tiny OC/OW/OH - starving the GPU.
 
-module {
-  func.func @dwconv_1x2x8() {
-    %c0 = arith.constant 0 : index
-    %c8 = arith.constant 8 : index
-    %c2 = arith.constant 2 : index
-    %c1 = arith.constant 1 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 3, 5, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>> -> tensor<1x3x5x8xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
-    %5 = tensor.empty() : tensor<1x1x2x8xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
-    %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x3x5x8xf32>, tensor<3x3x8xf32>) outs(%6 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 2, 8], strides = [1, 1, 1, 1] : tensor<1x1x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @dwconv_1x2x8() {
+  %c0 = arith.constant 0 : index
+  %c8 = arith.constant 8 : index
+  %c2 = arith.constant 2 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 3, 5, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x3x5x8xf32>> -> tensor<1x3x5x8xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [3, 3, 8], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x8xf32>> -> tensor<3x3x8xf32>
+  %5 = tensor.empty() : tensor<1x1x2x8xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
+  %7 = linalg.depthwise_conv_2d_nhwc_hwc {__internal_linalg_transform__ = "workgroup", dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4 : tensor<1x3x5x8xf32>, tensor<3x3x8xf32>) outs(%6 : tensor<1x1x2x8xf32>) -> tensor<1x1x2x8xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 1, 2, 8], strides = [1, 1, 1, 1] : tensor<1x1x2x8xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x2x8xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
index f3adbbcff435c..5f30177f1ff18 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_mali_matmul.mlir
@@ -2,23 +2,28 @@
 
 // Large matmul that can match the best tiling scheme.
 
-module {
-  func.func @matmul_1024x2048x512() {
-    %c0 = arith.constant 0 : index
-    %c2048 = arith.constant 2048 : index
-    %c1024 = arith.constant 1024 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
-    %5 = tensor.empty() : tensor<1024x2048xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%6 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_1024x2048x512() {
+  %c0 = arith.constant 0 : index
+  %c2048 = arith.constant 2048 : index
+  %c1024 = arith.constant 1024 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x2048xf32>> -> tensor<512x2048xf32>
+  %5 = tensor.empty() : tensor<1024x2048xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<1024x512xf32>, tensor<512x2048xf32>) outs(%6 : tensor<1024x2048xf32>) -> tensor<1024x2048xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x2048xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -32,23 +37,28 @@ module {
 
 // Small matmul N that can still tile to all threads in a workgroup.
 
-module {
-  func.func @matmul_3136x24x96() {
-    %c0 = arith.constant 0 : index
-    %c24 = arith.constant 24 : index
-    %c3136 = arith.constant 3136 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
-    %5 = tensor.empty() : tensor<3136x24xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%6 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1] : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_3136x24x96() {
+  %c0 = arith.constant 0 : index
+  %c24 = arith.constant 24 : index
+  %c3136 = arith.constant 3136 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<96x24xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [3136, 96], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3136x96xf32>> -> tensor<3136x96xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [96, 24], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<96x24xf32>> -> tensor<96x24xf32>
+  %5 = tensor.empty() : tensor<3136x24xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<3136x96xf32>, tensor<96x24xf32>) outs(%6 : tensor<3136x24xf32>) -> tensor<3136x24xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [3136, 24], strides = [1, 1] : tensor<3136x24xf32> -> !flow.dispatch.tensor<writeonly:tensor<3136x24xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -62,23 +72,28 @@ module {
 
 // Small matmul M that can still tile to all threads in a workgroup.
 
-module {
-  func.func @matmul_196x64x192() {
-    %c0 = arith.constant 0 : index
-    %c64 = arith.constant 64 : index
-    %c196 = arith.constant 196 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
-    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
-    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
-    %5 = tensor.empty() : tensor<196x64xf32>
-    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x64xf32>) -> tensor<196x64xf32>
-    %7 = linalg.matmul ins(%3, %4 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%6 : tensor<196x64xf32>) -> tensor<196x64xf32>
-    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1] : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
-    return
-  }
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+func.func @matmul_196x64x192() {
+  %c0 = arith.constant 0 : index
+  %c64 = arith.constant 64 : index
+  %c196 = arith.constant 196 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<196x192xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<192x64xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [196, 192], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<196x192xf32>> -> tensor<196x192xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [192, 64], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<192x64xf32>> -> tensor<192x64xf32>
+  %5 = tensor.empty() : tensor<196x64xf32>
+  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<196x64xf32>) -> tensor<196x64xf32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<196x192xf32>, tensor<192x64xf32>) outs(%6 : tensor<196x64xf32>) -> tensor<196x64xf32>
+  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [196, 64], strides = [1, 1] : tensor<196x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<196x64xf32>>
+  return
 }
 
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
@@ -92,19 +107,24 @@ module {
 
 // Small matmul K that can still tile to all threads in a workgroup.
 
-module { - func.func @matmul_12544x96x16() { - %c0 = arith.constant 0 : index - %c96 = arith.constant 96 : index - %c12544 = arith.constant 12544 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<12544x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x96xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<12544x96xf32> - linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>) - linalg.matmul ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>) - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_12544x96x16() { + %c0 = arith.constant 0 : index + %c96 = arith.constant 96 : index + %c12544 = arith.constant 12544 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<12544x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x96xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<12544x96xf32> + linalg.fill ins(%cst : f32) outs(%2 : memref<12544x96xf32>) + linalg.matmul ins(%0, %1 : memref<12544x16xf32>, memref<16x96xf32>) outs(%2 : memref<12544x96xf32>) + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -118,23 +138,28 @@ module { // Odd matmul M and small N that cannot utilize all threads in a workgroup. -module { - func.func @matmul_49x160x576() { - %c0 = arith.constant 0 : index - %c160 = arith.constant 160 : index - %c49 = arith.constant 49 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<49x576xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x160xf32> - %5 = tensor.empty() : tensor<49x160xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<49x160xf32>) -> tensor<49x160xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%6 : tensor<49x160xf32>) -> tensor<49x160xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1] : tensor<49x160xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_49x160x576() { + %c0 = arith.constant 0 : index + %c160 = arith.constant 160 : index + %c49 = arith.constant 49 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [49, 
576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<49x576xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 160], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x160xf32> + %5 = tensor.empty() : tensor<49x160xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<49x160xf32>) -> tensor<49x160xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<49x576xf32>, tensor<576x160xf32>) outs(%6 : tensor<49x160xf32>) -> tensor<49x160xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [49, 160], strides = [1, 1] : tensor<49x160xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -148,29 +173,34 @@ module { // Small matmul M to "shift" parallelism to N. -module { - func.func @matmul_2x1024x576() { - %cst = arith.constant 0.000000e+00 : f32 - %cst_0 = arith.constant 3.000000e+00 : f32 - %cst_1 = arith.constant 6.000000e+00 : f32 - %cst_2 = arith.constant 0.166666672 : f32 - %c0 = arith.constant 0 : index - %c3436864 = arith.constant 3436864 : index - %c10141312 = arith.constant 10141312 : index - %c2304 = arith.constant 2304 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c3436864) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c10141312) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x576xf32> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x1024xf32> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x1024xf32> - %7 = tensor.empty() : tensor<2x1024xf32> - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - %9 = linalg.matmul ins(%4, %5 : tensor<2x576xf32>, tensor<576x1024xf32>) outs(%8 : tensor<2x1024xf32>) -> tensor<2x1024xf32> - flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_2x1024x576() { + %cst = arith.constant 0.000000e+00 : f32 + %cst_0 = arith.constant 3.000000e+00 : f32 + %cst_1 = arith.constant 6.000000e+00 : f32 + %cst_2 = arith.constant 0.166666672 : f32 + %c0 = arith.constant 0 : index + %c3436864 = arith.constant 3436864 : index + %c10141312 = arith.constant 10141312 : index + %c2304 = arith.constant 2304 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c3436864) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c10141312) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = 
flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x576xf32> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [576, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<576x1024xf32> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x1024xf32> + %7 = tensor.empty() : tensor<2x1024xf32> + %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x1024xf32>) -> tensor<2x1024xf32> + %9 = linalg.matmul ins(%4, %5 : tensor<2x576xf32>, tensor<576x1024xf32>) outs(%8 : tensor<2x1024xf32>) -> tensor<2x1024xf32> + flow.dispatch.tensor.store %9, %3, offsets = [0, 0], sizes = [2, 1024], strides = [1, 1] : tensor<2x1024xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -184,43 +214,54 @@ module { // Large matmul with i8 inputs. -module { - func.func @matmul_1024x2048x512xi8() { - %c0 = arith.constant 0 : index - %c2048 = arith.constant 2048 : index - %c1024 = arith.constant 1024 : index - %c0_i32 = arith.constant 0 : i32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xi8> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x2048xi8> - %5 = tensor.empty() : tensor<1024x2048xi32> - %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32> - %7 = linalg.matmul ins(%3, %4 : tensor<1024x512xi8>, tensor<512x2048xi8>) outs(%6 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xi32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_1024x2048x512xi8() { + %c0 = arith.constant 0 : index + %c2048 = arith.constant 2048 : index + %c1024 = arith.constant 1024 : index + %c0_i32 = arith.constant 0 : i32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xi8> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 2048], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x2048xi8> + %5 = tensor.empty() : tensor<1024x2048xi32> + %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32> + %7 = linalg.matmul ins(%3, %4 : tensor<1024x512xi8>, tensor<512x2048xi8>) outs(%6 : tensor<1024x2048xi32>) -> tensor<1024x2048xi32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : tensor<1024x2048xi32> -> !flow.dispatch.tensor> + return } // ----- -module { - func.func 
@batch_matmul_4x384x384() { - %c0 = arith.constant 0 : index - %c384 = arith.constant 384 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x384x32xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x32x384xf32> - %5 = tensor.empty() : tensor<4x384x384xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32> - %7 = linalg.batch_matmul ins(%3, %4 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%6 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1] : tensor<4x384x384xf32> -> !flow.dispatch.tensor> - return - } + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @batch_matmul_4x384x384() { + %c0 = arith.constant 0 : index + %c384 = arith.constant 384 : index + %c4 = arith.constant 4 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 384, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x384x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 384], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x32x384xf32> + %5 = tensor.empty() : tensor<4x384x384xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32> + %7 = linalg.batch_matmul ins(%3, %4 : tensor<4x384x32xf32>, tensor<4x32x384xf32>) outs(%6 : tensor<4x384x384xf32>) -> tensor<4x384x384xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 384, 384], strides = [1, 1, 1] : tensor<4x384x384xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -234,24 +275,29 @@ module { // Small batch matmul. 
-module { - func.func @batch_matmul_4x2x8() { - %c0 = arith.constant 0 : index - %c8 = arith.constant 8 : index - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x2x32xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x32x8xf32> - %5 = tensor.empty() : tensor<4x2x8xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32> - %7 = linalg.batch_matmul ins(%3, %4 : tensor<4x2x32xf32>, tensor<4x32x8xf32>) outs(%6 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 2, 8], strides = [1, 1, 1] : tensor<4x2x8xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @batch_matmul_4x2x8() { + %c0 = arith.constant 0 : index + %c8 = arith.constant 8 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4, 2, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x2x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4, 32, 8], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4x32x8xf32> + %5 = tensor.empty() : tensor<4x2x8xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32> + %7 = linalg.batch_matmul ins(%3, %4 : tensor<4x2x32xf32>, tensor<4x32x8xf32>) outs(%6 : tensor<4x2x8xf32>) -> tensor<4x2x8xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [4, 2, 8], strides = [1, 1, 1] : tensor<4x2x8xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -265,32 +311,37 @@ module { // Linalg.generic that is a batch matmul. 
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> #map4 = affine_map<(d0, d1, d2) -> (d2, d1)> #map5 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @generic_batch_matmul_32x2x512() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x32x64xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<32x64x512xf32> - %5 = tensor.empty() : tensor<32x8x512xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<32x8x512xf32>) -> tensor<32x8x512xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<8x32x64xf32>, tensor<32x64x512xf32>) outs(%6 : tensor<32x8x512xf32>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.mulf %in, %in_0 : f32 - %9 = arith.addf %out, %8 : f32 - linalg.yield %9 : f32 - } -> tensor<32x8x512xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 8, 512], strides = [1, 1, 1] : tensor<32x8x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @generic_batch_matmul_32x2x512() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x32x64xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<32x64x512xf32> + %5 = tensor.empty() : tensor<32x8x512xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<32x8x512xf32>) -> tensor<32x8x512xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<8x32x64xf32>, tensor<32x64x512xf32>) outs(%6 : tensor<32x8x512xf32>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %8 = arith.mulf %in, %in_0 : f32 + %9 = arith.addf %out, %8 : f32 + linalg.yield %9 : f32 + } -> tensor<32x8x512xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 8, 512], strides = [1, 1, 1] : tensor<32x8x512xf32> -> !flow.dispatch.tensor> + return } // 
CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -304,42 +355,48 @@ module { // Linalg.generic that is a batch matmul. +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @generic_batch_matmul_8x2500x512x4608() { - %c168607744 = arith.constant 168607744 : index - %c537247744 = arith.constant 537247744 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c168607744) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c537247744) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [8, 2500, 4608], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x2500x4608xf32> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4608, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4608x512xf32> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x2500x512xf32> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x2500x512xf32> - %9 = tensor.empty() : tensor<8x2500x512xf32> - %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x2500x512xf32>) -> tensor<8x2500x512xf32> - %11 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%5, %6 : tensor<8x2500x4608xf32>, tensor<4608x512xf32>) outs(%10 : tensor<8x2500x512xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %13 = arith.mulf %in, %in_0 : f32 - %14 = arith.addf %13, %out : f32 - linalg.yield %14 : f32 - } -> tensor<8x2500x512xf32> - %12 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%11, %7, %8 : tensor<8x2500x512xf32>, tensor<8x2500x512xf32>, tensor<8x2500x512xf32>) outs(%9 : tensor<8x2500x512xf32>) { - ^bb0(%in: f32, %in_0: f32, %in_1: f32, %out: f32): - %13 = arith.addf %in, %in_0 : f32 - %14 = arith.subf %13, %in_1 : f32 - linalg.yield %14 : f32 - } -> tensor<8x2500x512xf32> - flow.dispatch.tensor.store %12, %4, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : tensor<8x2500x512xf32> -> !flow.dispatch.tensor> - return - } +func.func @generic_batch_matmul_8x2500x512x4608() { + %c168607744 = arith.constant 168607744 : index + %c537247744 = arith.constant 537247744 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c168607744) : !flow.dispatch.tensor> + %1 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c537247744) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [8, 2500, 4608], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x2500x4608xf32> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4608, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4608x512xf32> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x2500x512xf32> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<8x2500x512xf32> + %9 = tensor.empty() : tensor<8x2500x512xf32> + %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<8x2500x512xf32>) -> tensor<8x2500x512xf32> + %11 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%5, %6 : tensor<8x2500x4608xf32>, tensor<4608x512xf32>) outs(%10 : tensor<8x2500x512xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %13 = arith.mulf %in, %in_0 : f32 + %14 = arith.addf %13, %out : f32 + linalg.yield %14 : f32 + } -> tensor<8x2500x512xf32> + %12 = linalg.generic {indexing_maps = [#map3, #map3, #map3, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%11, %7, %8 : tensor<8x2500x512xf32>, tensor<8x2500x512xf32>, tensor<8x2500x512xf32>) outs(%9 : tensor<8x2500x512xf32>) { + ^bb0(%in: f32, %in_0: f32, %in_1: f32, %out: f32): + %13 = arith.addf %in, %in_0 : f32 + %14 = arith.subf %13, %in_1 : f32 + linalg.yield %14 : f32 + } -> tensor<8x2500x512xf32> + flow.dispatch.tensor.store %12, %4, offsets = [0, 0, 0], sizes = [8, 2500, 512], strides = [1, 1, 1] : tensor<8x2500x512xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir index fe1e2ad03f40f..4c3f060c91a46 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir @@ -1,23 +1,28 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=pascal@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s -module { - func.func @matmul_4x4096x9216() { - %c36864 = arith.constant 36864 : index - %c667974912 = arith.constant 667974912 : index - %c209920 = arith.constant 209920 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c209920) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c667974912) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan 
set(0) binding(2) type(storage_buffer) alignment(64) offset(%c36864) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 9216], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x9216xf32> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<9216x4096xf32> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x4096xf32> - %7 = linalg.matmul ins(%4, %5 : tensor<4x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<4x4096xf32>) -> tensor<4x4096xf32> - flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [4, 4096], strides = [1, 1] : tensor<4x4096xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_4x4096x9216() { + %c36864 = arith.constant 36864 : index + %c667974912 = arith.constant 667974912 : index + %c209920 = arith.constant 209920 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c209920) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c667974912) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c36864) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4, 9216], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x9216xf32> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<9216x4096xf32> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4x4096xf32> + %7 = linalg.matmul ins(%4, %5 : tensor<4x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<4x4096xf32>) -> tensor<4x4096xf32> + flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [4, 4096], strides = [1, 1] : tensor<4x4096xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -31,24 +36,29 @@ module { // Matvec does not go down matmul pipelines.
-module { - func.func @matmul_1x4096x9216() { - %c36864 = arith.constant 36864 : index - %c667974912 = arith.constant 667974912 : index - %c209920 = arith.constant 209920 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c209920) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c667974912) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c36864) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 9216], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x9216xf32> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<9216x4096xf32> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf32> - %7 = linalg.matmul ins(%4, %5 : tensor<1x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<1x4096xf32>) -> tensor<1x4096xf32> - flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : tensor<1x4096xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_1x4096x9216() { + %c36864 = arith.constant 36864 : index + %c667974912 = arith.constant 667974912 : index + %c209920 = arith.constant 209920 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c209920) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c667974912) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c36864) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 9216], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x9216xf32> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [9216, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<9216x4096xf32> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x4096xf32> + %7 = linalg.matmul ins(%4, %5 : tensor<1x9216xf32>, tensor<9216x4096xf32>) outs(%6 : tensor<1x4096xf32>) -> tensor<1x4096xf32> + flow.dispatch.tensor.store %7, %3, offsets = [0, 0], sizes = [1, 4096], strides = [1, 1] : tensor<1x4096xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config @@ -62,29 +72,34 @@ module { // Multi-reduction-dimension transposed-B matmul. 
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1)> -module { - func.func @multi_reduction_transposed_b_matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2048, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2048x86x128xf32> - %5 = tensor.empty() : tensor<4096x2048xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4096x2048xf32>) -> tensor<4096x2048xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<4096x86x128xf32>, tensor<2048x86x128xf32>) outs(%6 : tensor<4096x2048xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %8 = arith.mulf %in, %in_0 : f32 - %9 = arith.addf %out, %8 : f32 - linalg.yield %9 : f32 - } -> tensor<4096x2048xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4096, 2048], strides = [1, 1] : tensor<4096x2048xf32> -> !flow.dispatch.tensor> - return - } +func.func @multi_reduction_transposed_b_matmul() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2048, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<2048x86x128xf32> + %5 = tensor.empty() : tensor<4096x2048xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<4096x2048xf32>) -> tensor<4096x2048xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%3, %4 : tensor<4096x86x128xf32>, tensor<2048x86x128xf32>) outs(%6 : tensor<4096x2048xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %8 = arith.mulf %in, %in_0 : f32 + %9 = arith.addf %out, %8 : f32 + linalg.yield %9 : f32 + } -> tensor<4096x2048xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [4096, 2048], strides = [1, 1] : tensor<4096x2048xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config diff --git 
a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir index f274f4fe2b412..db27e00cce445 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir @@ -2,35 +2,42 @@ // RUN: --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass)' %s | \ // RUN: FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #map = affine_map<(d0, d1) -> (d0, d1)> -module { - func.func @matmul_256x1024x128_div_add() { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c256 = arith.constant 256 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> - %7 = tensor.empty() : tensor<256x1024xf16> - %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x128xf16> - %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x1024xf16> - %10 = tensor.empty() : tensor<256x1024xf16> - %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) { - ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): - %14 = arith.divf %in, %in_0 : f16 - %15 = arith.addf %14, %in_1 : f16 - linalg.yield %15 : f16 - } -> tensor<256x1024xf16> - flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> - return - } +func.func @matmul_256x1024x128_div_add() { + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> + %7 = tensor.empty() : tensor<256x1024xf16> + %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x128xf16> + %9 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x1024xf16> + %10 = tensor.empty() : tensor<256x1024xf16> + %11 = linalg.fill ins(%cst : f16) outs(%10 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %12 = linalg.matmul ins(%8, %9 : tensor<256x128xf16>, tensor<128x1024xf16>) outs(%11 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %13 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%12, %5, %6 : tensor<256x1024xf16>, tensor<256x1024xf16>, tensor<256x1024xf16>) outs(%7 : tensor<256x1024xf16>) { + ^bb0(%in: f16, %in_0: f16, %in_1: f16, %out: f16): + %14 = arith.divf %in, %in_0 : f16 + %15 = arith.addf %14, %in_1 : f16 + linalg.yield %15 : f16 + } -> tensor<256x1024xf16> + flow.dispatch.tensor.store %13, %4, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -42,29 +49,35 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -module { - func.func @batch_matmul_16x128x256x512_div() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x512xf16> - %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x512x256xf16> - %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x256xf16> - %7 = tensor.empty() : tensor<16x128x256xf16> - %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> - %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> - %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : 
tensor<16x128x256xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %11 = arith.divf %in, %in_0 : f16 - linalg.yield %11 : f16 - } -> tensor<16x128x256xf16> - flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor> - return - } +func.func @batch_matmul_16x128x256x512_div() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x512xf16> + %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x512x256xf16> + %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x256xf16> + %7 = tensor.empty() : tensor<16x128x256xf16> + %8 = linalg.fill ins(%cst : f16) outs(%7 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> + %9 = linalg.batch_matmul ins(%4, %5 : tensor<16x128x512xf16>, tensor<16x512x256xf16>) outs(%8 : tensor<16x128x256xf16>) -> tensor<16x128x256xf16> + %10 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9, %6 : tensor<16x128x256xf16>, tensor<16x128x256xf16>) outs(%7 : tensor<16x128x256xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %11 = arith.divf %in, %in_0 : f16 + linalg.yield %11 : f16 + } -> tensor<16x128x256xf16> + flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : tensor<16x128x256xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -78,32 +91,37 @@ module { // Linalg.generic that is a batch matmul. 
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> #map4 = affine_map<(d0, d1, d2) -> (d2, d1)> #map5 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @generic_batch_matmul_32x8x512x64() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x64xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<32x64x512xf16> - %5 = tensor.empty() : tensor<32x128x512xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.mulf %in, %in_0 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } -> tensor<32x128x512xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : tensor<32x128x512xf16> -> !flow.dispatch.tensor> - return - } +func.func @generic_batch_matmul_32x8x512x64() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [128, 32, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x64xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [32, 64, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<32x64x512xf16> + %5 = tensor.empty() : tensor<32x128x512xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<32x128x512xf16>) -> tensor<32x128x512xf16> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<128x32x64xf16>, tensor<32x64x512xf16>) outs(%6 : tensor<32x128x512xf16>) attrs = {linalg.memoized_indexing_maps = [#map3, #map4, #map5]} { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.mulf %in, %in_0 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } -> tensor<32x128x512xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [32, 128, 512], strides = [1, 1, 1] : 
tensor<32x128x512xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -117,21 +135,26 @@ module { // K dim size not divisible by 32. -module { - func.func @batch_matmul_16x1024x1024x80() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x1024x80xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x80x1024xf16> - %5 = tensor.empty() : tensor<16x1024x1024xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> - %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @batch_matmul_16x1024x1024x80() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 1024, 80], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x1024x80xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 80, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x80x1024xf16> + %5 = tensor.empty() : tensor<16x1024x1024xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> + %7 = linalg.batch_matmul ins(%3, %4 : tensor<16x1024x80xf16>, tensor<16x80x1024xf16>) outs(%6 : tensor<16x1024x1024xf16>) -> tensor<16x1024x1024xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [16, 1024, 1024], strides = [1, 1, 1] : tensor<16x1024x1024xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config @@ -145,23 +168,28 @@ module { // Small K - not supported by cooperative matrix.
-module { - func.func @matmul_256x1024x8() { - %c0 = arith.constant 0 : index - %c1024 = arith.constant 1024 : index - %c256 = arith.constant 256 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x8xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x1024xf16> - %5 = tensor.empty() : tensor<256x1024xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%6 : tensor<256x1024xf16>) -> tensor<256x1024xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul_256x1024x8() { + %c0 = arith.constant 0 : index + %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 8], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x8xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [8, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<8x1024xf16> + %5 = tensor.empty() : tensor<256x1024xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + %7 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%3, %4 : tensor<256x8xf16>, tensor<8x1024xf16>) outs(%6 : tensor<256x1024xf16>) -> tensor<256x1024xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : tensor<256x1024xf16> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir index ca52b8ac340db..f42b1c1fd02ff 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_user.mlir @@ -1,25 +1,30 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=vp_android_baseline_2022@vulkan --pass-pipeline='builtin.module(iree-codegen-materialize-user-configs, iree-spirv-select-lowering-strategy-pass)' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - 
func.func @matmul_128x1024x256() { - %cst = arith.constant 0.000000e+00 : f32 - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> - %5 = tensor.empty() : tensor<128x1024xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> - return - } +func.func @matmul_128x1024x256() { + %cst = arith.constant 0.000000e+00 : f32 + %c128 = arith.constant 128 : index + %c1024 = arith.constant 1024 : index + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf32> + %5 = tensor.empty() : tensor<128x1024xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<128x256xf32>, tensor<256x1024xf32>) outs(%6 : tensor<128x1024xf32>) -> tensor<128x1024xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 1024], strides = [1, 1] : tensor<128x1024xf32> -> !flow.dispatch.tensor> + return } // CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_to_spirv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_to_spirv.mlir index 7b80a768717c7..b51836711be08 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_to_spirv.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/convert_to_spirv.mlir @@ -28,7 +28,7 @@ hal.executable private @push_constant { // INDEX64: %[[AC:.+]] = spirv.AccessChain %[[ADDR]][%[[INDEX_0]], %[[INDEX_1]]] : !spirv.ptr [0])>, PushConstant> // INDEX64: %[[LOAD:.+]] = spirv.Load "PushConstant" %[[AC]] : i32 // INDEX64: spirv.UConvert %[[LOAD]] : i32 to i64 - %0 = hal.interface.constant.load[2] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 %1 = arith.index_castui %0 : i32 to index return %1 : index } @@ -65,17 +65,17 @@ hal.executable private @resource_bindings_in_same_func { // Same type // CHECK: spirv.mlir.addressof @[[ARG0]] // CHECK: spirv.mlir.addressof @[[ARG0]] - %0 = hal.interface.binding.subspan 
set(1) binding(2) type(storage_buffer) : memref<4x4xf32, #spirv.storage_class> - %1 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<4x4xf32, #spirv.storage_class> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<4x4xf32, #spirv.storage_class> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<4x4xf32, #spirv.storage_class> // Different type // CHECK: spirv.mlir.addressof @[[ARG1_0]] // CHECK: spirv.mlir.addressof @[[ARG1_1]] - %2 = hal.interface.binding.subspan set(1) binding(3) type(storage_buffer) : memref<4x4xf32, #spirv.storage_class> - %3 = hal.interface.binding.subspan set(1) binding(3) type(storage_buffer) : memref<4xvector<4xf32>, #spirv.storage_class> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(3) : memref<4x4xf32, #spirv.storage_class> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(3) : memref<4xvector<4xf32>, #spirv.storage_class> // CHECK: spirv.mlir.addressof @[[RET0]] - %4 = hal.interface.binding.subspan set(3) binding(4) type(storage_buffer) : memref<4x4xf32, #spirv.storage_class> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(4) : memref<4x4xf32, #spirv.storage_class> %5 = memref.load %0[%c0, %c0] : memref<4x4xf32, #spirv.storage_class> %6 = memref.load %1[%c0, %c0] : memref<4x4xf32, #spirv.storage_class> @@ -127,8 +127,8 @@ hal.executable private @resource_bindings_in_multi_entry_func { // CHECK: spirv.mlir.addressof @[[FUNC1_ARG]] // CHECK: spirv.mlir.addressof @[[FUNC1_RET]] %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<4x4xf32, #spirv.storage_class> - %1 = hal.interface.binding.subspan set(3) binding(4) type(storage_buffer) : memref<4xvector<4xf32>, #spirv.storage_class> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<4x4xf32, #spirv.storage_class> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(4) : memref<4xvector<4xf32>, #spirv.storage_class> %2 = memref.load %0[%c0, %c0] : memref<4x4xf32, #spirv.storage_class> %3 = memref.load %1[%c0] : memref<4xvector<4xf32>, #spirv.storage_class> @@ -144,8 +144,8 @@ hal.executable private @resource_bindings_in_multi_entry_func { // CHECK: spirv.mlir.addressof @[[FUNC2_ARG]] // CHECK: spirv.mlir.addressof @[[FUNC2_RET]] %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<4x4xf32, #spirv.storage_class> // Same type as previous function - %1 = hal.interface.binding.subspan set(3) binding(4) type(storage_buffer) : memref<4x4xf32, #spirv.storage_class> // Different type as previous function + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(1) binding(2) : memref<4x4xf32, #spirv.storage_class> // Same type as previous function + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(4) : memref<4x4xf32, #spirv.storage_class> // Different type as previous function %2 = memref.load %0[%c0, %c0] : memref<4x4xf32, #spirv.storage_class> %3 = memref.load %1[%c0, %c0] : memref<4x4xf32, #spirv.storage_class> @@ -175,9 +175,9 @@ hal.executable private @interface_binding { builtin.module attributes {spirv.target_env = #spirv.target_env<#spirv.vce, #spirv.resource_limits<>>} { func.func @interface_binding() -> f32 { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) 
type(storage_buffer) : memref<8x5xf32, #spirv.storage_class> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5xf32, #spirv.storage_class> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8x5xf32, #spirv.storage_class> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<8x5xf32, #spirv.storage_class> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<5xf32, #spirv.storage_class> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<8x5xf32, #spirv.storage_class> %3 = memref.load %0[%c0, %c0] : memref<8x5xf32, #spirv.storage_class> %4 = memref.load %1[%c0] : memref<5xf32, #spirv.storage_class> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir index 4a2cf5dfcd79c..a7a2f9d854699 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/emulate_i64.mlir @@ -2,32 +2,37 @@ // RUN: --pass-pipeline='builtin.module(func.func(iree-spirv-emulate-i64))' %s | \ // RUN: FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }> -module { - func.func @buffer_types() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %c1_i64 = arith.constant 1 : i64 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8xi32, #spirv.storage_class> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xi64, #spirv.storage_class> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xi64, #spirv.storage_class> - %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class> - %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class> - %5 = arith.addi %4, %c1_i64 : i64 - memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class> - return - } +func.func @buffer_types() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %c1_i64 = arith.constant 1 : i64 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<8xi32, #spirv.storage_class> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8xi64, #spirv.storage_class> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<8xi64, #spirv.storage_class> + %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class> + %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class> + %5 = arith.addi %4, %c1_i64 : i64 + memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class> + return } // Check that without the Int64 capability emulation produces expected i32 ops. 
// // CHECK-LABEL: func.func @buffer_types -// CHECK: [[REF_I64_0:%.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xvector<2xi32>, #spirv.storage_class> -// CHECK: [[REF_I64_1:%.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xvector<2xi32>, #spirv.storage_class> +// CHECK: [[REF_I64_0:%.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<8xvector<2xi32>, #spirv.storage_class> +// CHECK: [[REF_I64_1:%.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<8xvector<2xi32>, #spirv.storage_class> // CHECK: [[VI64:%.+]] = memref.load [[REF_I64_0]][{{%.+}}] : memref<8xvector<2xi32>, #spirv.storage_class> // CHECK: {{%.+}} = arith.addui_extended {{%.+}}, {{%.+}} : i32, i1 // CHECK: memref.store {{%.+}}, [[REF_I64_1]][{{%.+}}] : memref<8xvector<2xi32>, #spirv.storage_class> @@ -35,40 +40,44 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }> -module { - func.func @emulate_1d_vector() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c95232 = arith.constant 95232 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - %c36864 = arith.constant 36864 : index - %c1523712 = arith.constant 1523712 : index - %c96 = arith.constant 96 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref, #spirv.storage_class>{%c96} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c1523712) : memref, #spirv.storage_class>{%c36864} - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref, #spirv.storage_class>{%c36864} - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %thread_id_x = gpu.thread_id x - %3 = arith.muli %workgroup_id_x, %c32 : index - %4 = arith.addi %thread_id_x, %3 : index - %5 = memref.load %0[%4] : memref, #spirv.storage_class> - %6 = arith.extsi %5 : vector<4xi32> to vector<4xi64> - %7 = arith.extui %5 : vector<4xi32> to vector<4xi64> - %8 = arith.muli %6, %7 : vector<4xi64> - %9 = arith.addi %6, %8 : vector<4xi64> - %10 = arith.trunci %9 : vector<4xi64> to vector<4xi32> - %11 = arith.muli %workgroup_id_y, %c96 : index - %12 = arith.addi %4, %11 : index - %13 = arith.addi %12, %c95232 : index - memref.store %10, %2[%13] : memref, #spirv.storage_class> - return - } +func.func @emulate_1d_vector() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c95232 = arith.constant 95232 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + %c36864 = arith.constant 36864 : index + %c1523712 = arith.constant 1523712 : index + %c96 = arith.constant 96 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref, #spirv.storage_class>{%c96} + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c1523712) : memref, #spirv.storage_class>{%c36864} + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref, #spirv.storage_class>{%c36864} + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_id_y 
= hal.interface.workgroup.id[1] : index + %thread_id_x = gpu.thread_id x + %3 = arith.muli %workgroup_id_x, %c32 : index + %4 = arith.addi %thread_id_x, %3 : index + %5 = memref.load %0[%4] : memref, #spirv.storage_class> + %6 = arith.extsi %5 : vector<4xi32> to vector<4xi64> + %7 = arith.extui %5 : vector<4xi32> to vector<4xi64> + %8 = arith.muli %6, %7 : vector<4xi64> + %9 = arith.addi %6, %8 : vector<4xi64> + %10 = arith.trunci %9 : vector<4xi64> to vector<4xi32> + %11 = arith.muli %workgroup_id_y, %c96 : index + %12 = arith.addi %4, %11 : index + %13 = arith.addi %12, %c95232 : index + memref.store %10, %2[%13] : memref, #spirv.storage_class> + return } // Check that i64 emulation handles 1-D vector ops and does not introduce @@ -83,34 +92,39 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }> -module { - func.func @no_emulation() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %c1_i64 = arith.constant 1 : i64 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8xi32, #spirv.storage_class> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xi64, #spirv.storage_class> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xi64, #spirv.storage_class> - %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class> - %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class> - %5 = arith.addi %4, %c1_i64 : i64 - memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class> - return - } +func.func @no_emulation() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %c1_i64 = arith.constant 1 : i64 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<8xi32, #spirv.storage_class> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<8xi64, #spirv.storage_class> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<8xi64, #spirv.storage_class> + %3 = memref.load %0[%c0] : memref<8xi32, #spirv.storage_class> + %4 = memref.load %1[%c0] : memref<8xi64, #spirv.storage_class> + %5 = arith.addi %4, %c1_i64 : i64 + memref.store %5, %2[%c0] : memref<8xi64, #spirv.storage_class> + return } // Check that with the Int64 capability we do not emulate i64 ops. 
// // CHECK-LABEL: func.func @no_emulation // CHECK: [[CST1:%.+]] = arith.constant 1 : i64 -// CHECK: [[REF_I32:%.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8xi32, #spirv.storage_class> -// CHECK: [[REF_I64_0:%.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8xi64, #spirv.storage_class> -// CHECK: [[REF_I64_1:%.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<8xi64, #spirv.storage_class> +// CHECK: [[REF_I32:%.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<8xi32, #spirv.storage_class> +// CHECK: [[REF_I64_0:%.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<8xi64, #spirv.storage_class> +// CHECK: [[REF_I64_1:%.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) : memref<8xi64, #spirv.storage_class> // CHECK: [[VI32:%.+]] = memref.load [[REF_I32]][{{%.+}}] : memref<8xi32, #spirv.storage_class> // CHECK: [[VI64:%.+]] = memref.load [[REF_I64_0]][{{%.+}}] : memref<8xi64, #spirv.storage_class> // CHECK: {{%.+}} = arith.addi {{%.+}} : i64 diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/erase_storage_buffer_static_shape.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/erase_storage_buffer_static_shape.mlir index e473186cfb475..aa25417ac7f7c 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/erase_storage_buffer_static_shape.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/erase_storage_buffer_static_shape.mlir @@ -1,8 +1,14 @@ // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-spirv-erase-storage-buffer-static-shape))" %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @storage_buffer_load_store(%offset: index, %i0: index, %i1: index) { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%offset) : memref<256xf32, #hal.descriptor_type> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%offset) : memref<256xf32, #hal.descriptor_type> %val = memref.load %0[%i0] : memref<256xf32, #hal.descriptor_type> memref.store %val, %1[%i1] : memref<256xf32, #hal.descriptor_type> return @@ -11,8 +17,8 @@ func.func @storage_buffer_load_store(%offset: index, %i0: index, %i1: index) { // CHECK-LABEL: func.func @storage_buffer_load_store // CHECK-SAME: (%[[OFFSET:.+]]: index, %[[I0:.+]]: index, %[[I1:.+]]: index) // CHECK: %[[C256:.+]] = arith.constant 256 : index -// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%[[OFFSET]]) flags(ReadOnly) : memref>{%[[C256]]} -// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%[[OFFSET]]) : memref>{%[[C256]]} +// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%[[OFFSET]]) flags(ReadOnly) : memref>{%[[C256]]} +// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) alignment(64) offset(%[[OFFSET]]) : memref>{%[[C256]]} 
// CHECK: %[[LD:.+]] = memref.load %[[SPAN0]][%[[I0]]] // CHECK: memref.store %[[LD]], %[[SPAN1]][%[[I1]]] @@ -20,35 +26,51 @@ func.func @storage_buffer_load_store(%offset: index, %i0: index, %i1: index) { // Test that we don't rewrite memref for uniform buffers. +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @uniform_buffer_load(%offset: index, %i0: index) -> f32 { - %0 = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> %val = memref.load %0[%i0] : memref<256xf32, #hal.descriptor_type> return %val : f32 } // CHECK-LABEL: func.func @uniform_buffer_load -// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) alignment(64) offset(%{{.+}}) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> +// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%{{.+}}) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> // CHECK: memref.load %[[SPAN0]] // ----- // Test that we don't rewrite memref without HAL descriptor types. +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @uniform_buffer_load(%offset: index, %i0: index) -> f32 { - %0 = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32> %val = memref.load %0[%i0] : memref<256xf32> return %val : f32 } // CHECK-LABEL: func.func @uniform_buffer_load -// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(uniform_buffer) alignment(64) offset(%{{.+}}) flags(ReadOnly) : memref<256xf32> +// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%{{.+}}) flags(ReadOnly) : memref<256xf32> // CHECK: memref.load %[[SPAN0]] // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> func.func @storage_buffer_transfer_read_write(%offset: index, %i0: index, %i1: index) { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%offset) : memref<256xf32, #hal.descriptor_type> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%offset) flags(ReadOnly) : memref<256xf32, #hal.descriptor_type> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%offset) : memref<256xf32, #hal.descriptor_type> %f0 = arith.constant 0.0 : f32 %val = vector.transfer_read %0[%i0], %f0 {in_bounds = [true]} : memref<256xf32, #hal.descriptor_type>, vector<4xf32> vector.transfer_write %val, %1[%i1] {in_bounds = [true]} : vector<4xf32>, memref<256xf32, #hal.descriptor_type> @@ -61,9 +83,14 @@ func.func @storage_buffer_transfer_read_write(%offset: index, %i0: index, %i1: i // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @storage_buffer_subview(%offset : index, %i0: index, %i1: index) -> f32 { %c0 = arith.constant 0 : index - %subspan = 
hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<128xf32, strided<[1], offset: ?>, #hal.descriptor_type> + %subspan = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<128xf32, strided<[1], offset: ?>, #hal.descriptor_type> %subview = memref.subview %subspan[%i0][16][1] : memref<128xf32, strided<[1], offset: ?>, #hal.descriptor_type> to memref<16xf32, strided<[1], offset: ?>, #hal.descriptor_type> %value = memref.load %subview[%c0] : memref<16xf32, strided<[1], offset: ?>, #hal.descriptor_type> return %value : f32 @@ -74,13 +101,18 @@ func.func @storage_buffer_subview(%offset : index, %i0: index, %i1: index) -> f3 // ----- +#pipeline_layout = #hal.pipeline.layout + ]> +]> func.func @storage_buffer_cast(%offset: index) -> memref> { - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%offset) : memref<16xf32, #hal.descriptor_type> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%offset) : memref<16xf32, #hal.descriptor_type> %1 = memref.cast %0 : memref<16xf32, #hal.descriptor_type> to memref> return %1 : memref> } // CHECK-LABEL: func.func @storage_buffer_cast // CHECK: %[[C16:.+]] = arith.constant 16 : index -// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%{{.+}}) : memref>{%[[C16]]} +// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) alignment(64) offset(%{{.+}}) : memref>{%[[C16]]} // CHECK: return %[[SPAN0]] diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir index cc8f964359e71..bb7668854bd60 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir @@ -2,6 +2,13 @@ // RUN: --pass-pipeline='builtin.module(iree-codegen-materialize-user-configs, iree-spirv-select-lowering-strategy-pass)' \ // RUN: --verify-diagnostics --split-input-file %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x8xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<8x16xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x16xf32> - // expected-error @+1 {{expected 1 levels of tiling sizes, got 0}} - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x8xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : 
memref<8x16xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x16xf32> + // expected-error @+1 {{expected 1 levels of tiling sizes, got 0}} + linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<4x8xf32>, memref<8x16xf32>) outs(%2 : memref<4x16xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - // expected-error @+1 {{expected workgroup size to have three dimensions for SPIR-V pipelines}} - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32> - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) - return - } +// expected-error @+1 {{expected workgroup size to have three dimensions for SPIR-V pipelines}} +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x128xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<64x128xf32> + linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32> - // expected-error @+1 {{expected workgroup size dimensions not exceeding [128, 128, 64]}} - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x128xf32> + %2 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(2) : memref<64x128xf32> + // expected-error @+1 {{expected workgroup size dimensions not exceeding [128, 128, 64]}} + linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32> - // expected-error @+1 {{expected total invocation count in workgroup to be <= 128}} - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x128xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<64x128xf32> + // expected-error @+1 {{expected total invocation count in workgroup to be <= 128}} + linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32> - // expected-error @+1 {{expected total workgroup size to be multiple of 32}} - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x128xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<64x128xf32> + 
// expected-error @+1 {{expected total workgroup size to be multiple of 32}} + linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x128xf32> - // expected-error @+1 {{expected each workgroup size dimension to be power of two}} - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x128xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<64x128xf32> + // expected-error @+1 {{expected each workgroup size dimension to be power of two}} + linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x128xf32>) outs(%2 : memref<64x128xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<48x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x128xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<48x128xf32> - // expected-error @+1 {{LHS shape is indivisible by first level tile size}} - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<48x16xf32>, memref<16x128xf32>) outs(%2 : memref<48x128xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<48x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x128xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<48x128xf32> + // expected-error @+1 {{LHS shape is indivisible by first level tile size}} + 
linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<48x16xf32>, memref<16x128xf32>) outs(%2 : memref<48x128xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<64x16xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16x80xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<64x80xf32> - // expected-error @+1 {{RHS shape is indivisible by first level tile size}} - linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x80xf32>) outs(%2 : memref<64x80xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<64x16xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<16x80xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<64x80xf32> + // expected-error @+1 {{RHS shape is indivisible by first level tile size}} + linalg.matmul {compilation_info = #compilation} ins(%0, %1 : memref<64x16xf32>, memref<16x80xf32>) outs(%2 : memref<64x80xf32>) + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %5 = tensor.empty() : tensor<64x128xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> - // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}} - %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> - return 
- } +func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %5 = tensor.empty() : tensor<64x128xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> + // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}} + %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> + return } // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %5 = tensor.empty() : tensor<64x128xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> - // expected-error @+1 {{expected the fourth level tile sizes to match cooperative matrix sizes}} - %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> - return - } +func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 
0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %5 = tensor.empty() : tensor<64x128xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> + // expected-error @+1 {{expected the fourth level tile sizes to match cooperative matrix sizes}} + %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %5 = tensor.empty() : tensor<64x128xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> - // expected-error @+1 {{expected subgroup tile sizes to be multiple of [16, 16, 16]}} - %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> - return - } +func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %5 = tensor.empty() : tensor<64x128xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> + // expected-error @+1 {{expected subgroup tile sizes to be multiple of [16, 16, 16]}} + %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, 
tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %5 = tensor.empty() : tensor<64x128xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> - // expected-error @+1 {{expected workgroup x component equals to (warp_size * wg_tile_n / subgroup_tile_n)}} - %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> - return - } +func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %5 = tensor.empty() : tensor<64x128xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> + // expected-error @+1 {{expected workgroup x component equals to (warp_size * wg_tile_n / subgroup_tile_n)}} + %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", 
"vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> - %5 = tensor.empty() : tensor<64x128xf16> - %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> - // expected-error @+1 {{expected workgroup y component equals to (wg_tile_m / subgroup_tile_m)}} - %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> - return - } +func.func @matmul_tensor() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [64, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x32xf16> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x128xf16> + %5 = tensor.empty() : tensor<64x128xf16> + %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<64x128xf16>) -> tensor<64x128xf16> + // expected-error @+1 {{expected workgroup y component equals to (wg_tile_m / subgroup_tile_m)}} + %7 = linalg.matmul {compilation_info = #compilation} ins(%3, %4 : tensor<64x32xf16>, tensor<32x128xf16>) outs(%6 : tensor<64x128xf16>) -> tensor<64x128xf16> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [64, 128], strides = [1, 1] : tensor<64x128xf16> -> !flow.dispatch.tensor> + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target (d0 * 2)> #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c112 = arith.constant 112 : index - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - %3 = affine.apply #map()[%workgroup_id_z] - %4 = affine.apply #map()[%workgroup_count_z] - scf.for %arg0 = %3 to %c112 step %4 { - %5 = affine.apply #map()[%workgroup_id_y] - %6 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %5 to %c112 step %6 { - %7 = affine.apply #map1()[%workgroup_id_x] - %8 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg2 = %7 to %c16 step %8 { - %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4x16xf32> - %10 = affine.apply #map2(%arg0) - %11 = affine.apply #map2(%arg1) - %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x9x9x8xf32> - %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x8x16xf32> - %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> - // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}} - %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> - flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor> - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c112 = arith.constant 112 : index + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + %3 = affine.apply #map()[%workgroup_id_z] + %4 = affine.apply #map()[%workgroup_count_z] + scf.for %arg0 = %3 to %c112 step %4 { + %5 = affine.apply #map()[%workgroup_id_y] + %6 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %5 to %c112 step %6 { + %7 = affine.apply #map1()[%workgroup_id_x] + %8 = affine.apply #map1()[%workgroup_count_x] + scf.for 
%arg2 = %7 to %c16 step %8 { + %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4x16xf32> + %10 = affine.apply #map2(%arg0) + %11 = affine.apply #map2(%arg1) + %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x9x9x8xf32> + %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x8x16xf32> + %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> + // expected-error @+1 {{expected 4 levels of tiling sizes, got 3}} + %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> + flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor> } } - return } + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target (d0 * 2)> #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c112 = arith.constant 112 : index - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - %3 = affine.apply #map()[%workgroup_id_z] - %4 = affine.apply #map()[%workgroup_count_z] - scf.for %arg0 = %3 to %c112 step %4 { - %5 = affine.apply #map()[%workgroup_id_y] - %6 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %5 to %c112 step %6 { - %7 = affine.apply #map1()[%workgroup_id_x] - %8 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg2 = %7 to %c16 step %8 { - %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4x16xf32> - %10 = affine.apply #map2(%arg0) - %11 = affine.apply #map2(%arg1) - %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x9x9x8xf32> - %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : 
!flow.dispatch.tensor> -> tensor<3x3x8x16xf32> - %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> - // expected-error @+1 {{expected first level tile size divides the output size [OH, OW, OC]}} - %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> - flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor> - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c112 = arith.constant 112 : index + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + %3 = affine.apply #map()[%workgroup_id_z] + %4 = affine.apply #map()[%workgroup_count_z] + scf.for %arg0 = %3 to %c112 step %4 { + %5 = affine.apply #map()[%workgroup_id_y] + %6 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %5 to %c112 step %6 { + %7 = affine.apply #map1()[%workgroup_id_x] + %8 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg2 = %7 to %c16 step %8 { + %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4x16xf32> + %10 = affine.apply #map2(%arg0) + %11 = affine.apply #map2(%arg1) + %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x9x9x8xf32> + %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x8x16xf32> + %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> + // expected-error @+1 {{expected first level tile size divides the output size [OH, OW, OC]}} + %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> + flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor> } } - return } + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { 
iree.gpu.target = #iree_gpu.target (d0 * 2)> #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c112 = arith.constant 112 : index - %c16 = arith.constant 16 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - %3 = affine.apply #map()[%workgroup_id_z] - %4 = affine.apply #map()[%workgroup_count_z] - scf.for %arg0 = %3 to %c112 step %4 { - %5 = affine.apply #map()[%workgroup_id_y] - %6 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %5 to %c112 step %6 { - %7 = affine.apply #map1()[%workgroup_id_x] - %8 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg2 = %7 to %c16 step %8 { - %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4x16xf32> - %10 = affine.apply #map2(%arg0) - %11 = affine.apply #map2(%arg1) - %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x9x9x8xf32> - %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x8x16xf32> - %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> - // expected-error @+1 {{expected workgroup tile sizes to be the product of thread tile size and workgroup size}} - %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> - flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor> - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c112 = arith.constant 112 : index + %c16 = arith.constant 16 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + 
%workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + %3 = affine.apply #map()[%workgroup_id_z] + %4 = affine.apply #map()[%workgroup_count_z] + scf.for %arg0 = %3 to %c112 step %4 { + %5 = affine.apply #map()[%workgroup_id_y] + %6 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %5 to %c112 step %6 { + %7 = affine.apply #map1()[%workgroup_id_x] + %8 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg2 = %7 to %c16 step %8 { + %9 = flow.dispatch.tensor.load %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x4x4x16xf32> + %10 = affine.apply #map2(%arg0) + %11 = affine.apply #map2(%arg1) + %12 = flow.dispatch.tensor.load %0, offsets = [0, %10, %11, 0], sizes = [1, 9, 9, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x9x9x8xf32> + %13 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<3x3x8x16xf32> + %14 = linalg.fill ins(%cst : f32) outs(%9 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> + // expected-error @+1 {{expected workgroup tile sizes to be the product of thread tile size and workgroup size}} + %15 = linalg.conv_2d_nhwc_hwcf {compilation_info = #compilation, dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%12, %13 : tensor<1x9x9x8xf32>, tensor<3x3x8x16xf32>) outs(%14 : tensor<1x4x4x16xf32>) -> tensor<1x4x4x16xf32> + flow.dispatch.tensor.store %15, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, 4, 4, 16], strides = [1, 1, 1, 1] : tensor<1x4x4x16xf32> -> !flow.dispatch.tensor> } } - return } + return } // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #translation = #iree_codegen.translation_info #compilation = #iree_codegen.compilation_info -module { - func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32> - // expected-error @+1 {{expected tile sizes for KH and KW to be 1}} - linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : memref<1x7x7x576xf32>) - return - } +func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1x11x11x576xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<5x5x576xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1x7x7x576xf32> + // expected-error @+1 {{expected tile sizes for KH and KW to be 1}} + linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : 
memref<1x7x7x576xf32>)
+  return
 }
 // -----
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
-module {
-  func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
-    %c0 = arith.constant 0 : index
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1x11x11x576xf32>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5x5x576xf32>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<1x7x7x576xf32>
-    // expected-error @+1 {{expected the fourth level of tile size to be [0, 1, 0, 0]}}
-    linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : memref<1x7x7x576xf32>)
-    return
-  }
+func.func @illegal() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} {
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<1x11x11x576xf32>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<5x5x576xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<1x7x7x576xf32>
+  // expected-error @+1 {{expected the fourth level of tile size to be [0, 1, 0, 0]}}
+  linalg.depthwise_conv_2d_nhwc_hwc {compilation_info = #compilation} ins(%0, %1 : memref<1x11x11x576xf32>, memref<5x5x576xf32>) outs(%2 : memref<1x7x7x576xf32>)
+  return
 }
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
index 55bb2be6bdcda..582890159f525 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
@@ -1,5 +1,14 @@
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=cdna2@vulkan --pass-pipeline='builtin.module(iree-codegen-spirv-configuration-pipeline, func.func(iree-spirv-lower-executable-target-pass))' %s | FileCheck %s
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 32)>
 #map1 = affine_map<()[s0] -> (s0 * 128)>
@@ -10,63 +19,61 @@
 #map6 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
 #translation = #iree_codegen.translation_info
 #compilation = #iree_codegen.compilation_info<lowering_config = #config, translation_info = #translation>
-module {
-  func.func @matmul_i4_quant_weight() {
-    %c32 = arith.constant 32 : index
-    %c128 = arith.constant 128 : index
-    %c0 = arith.constant 0 : index
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) 
alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %5 = affine.apply #map()[%workgroup_id_y] - %6 = flow.dispatch.tensor.load %3, offsets = [%5, 0, 0], sizes = [%c32, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor - %7 = affine.apply #map1()[%workgroup_id_x] - %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, %7], sizes = [86, 128, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<86x128x?xi4> - %9 = affine.apply #map1()[%workgroup_id_x] - %10 = flow.dispatch.tensor.load %1, offsets = [0, %9], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x?xf32> - %11 = affine.apply #map1()[%workgroup_id_x] - %12 = flow.dispatch.tensor.load %2, offsets = [0, %11], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x?xi4> - %13 = tensor.empty() : tensor<86x128x128xf32> - %cast = tensor.cast %8 : tensor<86x128x?xi4> to tensor<86x128x128xi4> - %cast_0 = tensor.cast %10 : tensor<86x?xf32> to tensor<86x128xf32> - %cast_1 = tensor.cast %12 : tensor<86x?xi4> to tensor<86x128xi4> - %14 = linalg.generic {indexing_maps = [#map2, #map3, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cast, %cast_0, %cast_1 : tensor<86x128x128xi4>, tensor<86x128xf32>, tensor<86x128xi4>) outs(%13 : tensor<86x128x128xf32>) { - ^bb0(%in: i4, %in_4: f32, %in_5: i4, %out: f32): - %20 = arith.extsi %in : i4 to i32 - %21 = arith.extsi %in_5 : i4 to i32 - %22 = arith.subi %20, %21 : i32 - %23 = arith.sitofp %22 : i32 to f32 - %24 = arith.mulf %23, %in_4 : f32 - linalg.yield %24 : f32 - } -> tensor<86x128x128xf32> - %15 = tensor.empty() : tensor<32x128xf32> - %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x128xf32>) -> tensor<32x128xf32> - %cast_2 = tensor.cast %6 : tensor to tensor<32x86x128xf32> - %17 = linalg.generic {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_2, %14 : tensor<32x86x128xf32>, tensor<86x128x128xf32>) outs(%16 : tensor<32x128xf32>) attrs = {compilation_info = #compilation} { - ^bb0(%in: f32, %in_4: f32, %out: f32): - %20 = arith.mulf %in, %in_4 : f32 - %21 = arith.addf %out, %20 : f32 - linalg.yield %21 : f32 - } -> tensor<32x128xf32> - %cast_3 = tensor.cast %17 : tensor<32x128xf32> to tensor - %18 = affine.apply #map()[%workgroup_id_y] - %19 = affine.apply #map1()[%workgroup_id_x] - flow.dispatch.tensor.store %cast_3, %4, offsets = [%18, %19], sizes = [%c32, %c128], strides = [1, 1] : tensor -> !flow.dispatch.tensor> - return - } +func.func @matmul_i4_quant_weight() { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : 
!flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %5 = affine.apply #map()[%workgroup_id_y] + %6 = flow.dispatch.tensor.load %3, offsets = [%5, 0, 0], sizes = [%c32, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor + %7 = affine.apply #map1()[%workgroup_id_x] + %8 = flow.dispatch.tensor.load %0, offsets = [0, 0, %7], sizes = [86, 128, %c128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<86x128x?xi4> + %9 = affine.apply #map1()[%workgroup_id_x] + %10 = flow.dispatch.tensor.load %1, offsets = [0, %9], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x?xf32> + %11 = affine.apply #map1()[%workgroup_id_x] + %12 = flow.dispatch.tensor.load %2, offsets = [0, %11], sizes = [86, %c128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x?xi4> + %13 = tensor.empty() : tensor<86x128x128xf32> + %cast = tensor.cast %8 : tensor<86x128x?xi4> to tensor<86x128x128xi4> + %cast_0 = tensor.cast %10 : tensor<86x?xf32> to tensor<86x128xf32> + %cast_1 = tensor.cast %12 : tensor<86x?xi4> to tensor<86x128xi4> + %14 = linalg.generic {indexing_maps = [#map2, #map3, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cast, %cast_0, %cast_1 : tensor<86x128x128xi4>, tensor<86x128xf32>, tensor<86x128xi4>) outs(%13 : tensor<86x128x128xf32>) { + ^bb0(%in: i4, %in_4: f32, %in_5: i4, %out: f32): + %20 = arith.extsi %in : i4 to i32 + %21 = arith.extsi %in_5 : i4 to i32 + %22 = arith.subi %20, %21 : i32 + %23 = arith.sitofp %22 : i32 to f32 + %24 = arith.mulf %23, %in_4 : f32 + linalg.yield %24 : f32 + } -> tensor<86x128x128xf32> + %15 = tensor.empty() : tensor<32x128xf32> + %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<32x128xf32>) -> tensor<32x128xf32> + %cast_2 = tensor.cast %6 : tensor to tensor<32x86x128xf32> + %17 = linalg.generic {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%cast_2, %14 : tensor<32x86x128xf32>, tensor<86x128x128xf32>) outs(%16 : tensor<32x128xf32>) attrs = {compilation_info = #compilation} { + ^bb0(%in: f32, %in_4: f32, %out: f32): + %20 = arith.mulf %in, %in_4 : f32 + %21 = arith.addf %out, %20 : f32 + linalg.yield %21 : f32 + } -> tensor<32x128xf32> + %cast_3 = tensor.cast %17 : tensor<32x128xf32> to tensor + %18 = affine.apply #map()[%workgroup_id_y] + %19 = affine.apply #map1()[%workgroup_id_x] + flow.dispatch.tensor.store %cast_3, %4, offsets = [%18, %19], sizes = [%c32, %c128], strides = [1, 1] : tensor -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @matmul_i4_quant_weight() // CHECK: %[[A_ALLOC:.+]] = memref.alloc() : memref<32x1x36xf32, #gpu.address_space> // CHECK: %[[B_ALLOC:.+]] = memref.alloc() : memref<1x32x132xf32, #gpu.address_space> -// CHECK: %[[WEIGHT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK: %[[SCALE_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) -// CHECK: %[[ZP_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) +// CHECK: %[[WEIGHT_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK: %[[SCALE_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) 
binding(1) +// CHECK: %[[ZP_BINDING:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: scf.for %arg0 = %c0 to %c86 step %c1 iter_args({{.+}}) -> (vector<4xf32>, vector<4xf32>, vector<4xf32>, vector<4xf32>) // CHECK: %[[SCALE0:.+]] = vector.transfer_read %[[SCALE_BINDING]] // CHECK: %[[SCALE1:.+]] = vector.transfer_read %[[SCALE_BINDING]] diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir index 838d95c7186a4..274669dc5897f 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir @@ -6,9 +6,6 @@ // Verify pipelining + multi-buffering. -#compilation = #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = > #pipeline_layout = #hal.pipeline.layout, @@ -17,8 +14,10 @@ #hal.descriptor_set.binding<3, storage_buffer> ]> ]> +#compilation = #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = > #map = affine_map<(d0, d1) -> (d0, d1)> - hal.executable @matmul_f32_128x256x64 { hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb">) { hal.executable.export public @matmul_f32_128x256x64 ordinal(0) layout(#pipeline_layout) { @@ -30,10 +29,10 @@ hal.executable @matmul_f32_128x256x64 { func.func @matmul_f32_128x256x64() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x512xf32> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x256xf32> %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> @@ -95,9 +94,6 @@ hal.executable @matmul_f32_128x256x64 { // Store in stage 0 of pipeline. 
-#compilation = #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = > #pipeline_layout = #hal.pipeline.layout, @@ -106,8 +102,10 @@ hal.executable @matmul_f32_128x256x64 { #hal.descriptor_set.binding<3, storage_buffer> ]> ]> +#compilation = #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = > #map = affine_map<(d0, d1) -> (d0, d1)> - hal.executable @matmul_f32_128x256x64 { hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb">) { hal.executable.export public @matmul_f32_128x256x64 ordinal(0) layout(#pipeline_layout) { @@ -119,10 +117,10 @@ hal.executable @matmul_f32_128x256x64 { func.func @matmul_f32_128x256x64() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x512xf32> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x256xf32> %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> @@ -197,9 +195,6 @@ hal.executable @matmul_f32_128x256x64 { // Check that fused transposed consumer elementwise op does not cause extra workgroup memory allocations. 
-#compilation = #iree_codegen.compilation_info< - lowering_config = #iree_codegen.lowering_config, - translation_info = > #pipeline_layout = #hal.pipeline.layout, @@ -208,7 +203,9 @@ hal.executable @matmul_f32_128x256x64 { #hal.descriptor_set.binding<3, storage_buffer> ]> ]> - +#compilation = #iree_codegen.compilation_info< + lowering_config = #iree_codegen.lowering_config, + translation_info = > hal.executable @matmul_f16_4096x512x512 { hal.executable.variant public @vulkan_spirv_fb target(<"vulkan-spirv", "vulkan-spirv-fb">) { hal.executable.export public @matmul_f16_4096x512x512 ordinal(0) layout(#pipeline_layout) { @@ -220,10 +217,10 @@ hal.executable @matmul_f16_4096x512x512 { func.func @matmul_f16_4096x512x512() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x512xf16> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x512xf16> %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [512], strides = [1] : !flow.dispatch.tensor> -> tensor<512xf16> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir index 90e8dc361241b..0948ba9c9aa5b 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matvec.mlir @@ -1,41 +1,48 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=cdna2@vulkan --pass-pipeline='builtin.module(iree-spirv-select-lowering-strategy-pass, func.func(iree-spirv-lower-executable-target-pass))' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> #map2 = affine_map<(d0, d1, d2) -> (d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0)> -module { - func.func @i4_dequant_matvec_f32() { - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = 
hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> - %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> - %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> - %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> - %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x128xf32> - %9 = tensor.empty() : tensor<4096xf32> - %10 = tensor.empty() : tensor<4096x86x128xf32> - %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096x86x128xf32>) { - ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): - %14 = arith.extui %in : i4 to i32 - %15 = arith.uitofp %14 : i32 to f32 - %16 = arith.subf %15, %in_1 : f32 - %17 = arith.mulf %16, %in_0 : f32 - linalg.yield %17 : f32 - } -> tensor<4096x86x128xf32> - %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%11 : tensor<4096xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %14 = arith.mulf %in, %in_0 : f32 - %15 = arith.addf %14, %out : f32 - linalg.yield %15 : f32 - } -> tensor<4096xf32> - flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor> - return - } +func.func @i4_dequant_matvec_f32() { + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) : !flow.dispatch.tensor> + %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> + %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> + %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf32> + %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<86x128xf32> + %9 = tensor.empty() : tensor<4096xf32> + %10 = tensor.empty() : tensor<4096x86x128xf32> + %11 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32> + %12 = linalg.generic {indexing_maps = [#map, 
#map1, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5, %6, %7 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096x86x128xf32>) { + ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32): + %14 = arith.extui %in : i4 to i32 + %15 = arith.uitofp %14 : i32 to f32 + %16 = arith.subf %15, %in_1 : f32 + %17 = arith.mulf %16, %in_0 : f32 + linalg.yield %17 : f32 + } -> tensor<4096x86x128xf32> + %13 = linalg.generic {indexing_maps = [#map2, #map, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %12 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%11 : tensor<4096xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %14 = arith.mulf %in, %in_0 : f32 + %15 = arith.addf %14, %out : f32 + linalg.yield %15 : f32 + } -> tensor<4096xf32> + flow.dispatch.tensor.store %13, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @i4_dequant_matvec_f32() diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir index 9576c054f5224..243a361dc81b2 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_reduction.mlir @@ -2,6 +2,12 @@ // RUN: --pass-pipeline='builtin.module(func.func(iree-codegen-decompose-softmax), iree-spirv-select-lowering-strategy-pass, func.func(iree-spirv-lower-executable-target-pass))' \ // RUN: %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d0)> -module { - func.func @warp_reduction_dispatch() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %c10240 = arith.constant 10240 : index - %cst = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 10240], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x10240xf32> - %3 = tensor.empty() : tensor<512xf32> - %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<512xf32>) -> tensor<512xf32> - %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x10240xf32>) outs(%4 : tensor<512xf32>) { - ^bb0(%in: f32, %out: f32): - %6 = arith.addf %in, %out : f32 - linalg.yield %6 : f32 - } -> tensor<512xf32> - flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [512], strides = [1] : tensor<512xf32> -> !flow.dispatch.tensor> - return - } +func.func @warp_reduction_dispatch() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %c10240 = arith.constant 10240 : index + %cst = arith.constant 1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [512, 10240], strides = [1, 1] : 
!flow.dispatch.tensor> -> tensor<512x10240xf32> + %3 = tensor.empty() : tensor<512xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<512xf32>) -> tensor<512xf32> + %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%2 : tensor<512x10240xf32>) outs(%4 : tensor<512xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %in, %out : f32 + linalg.yield %6 : f32 + } -> tensor<512xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0], sizes = [512], strides = [1] : tensor<512xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @warp_reduction_dispatch @@ -86,6 +90,12 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d1)> -module { - func.func @warp_reduction_dispatch() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x9216x9216xf16> - %3 = tensor.empty() : tensor<10x9216x9216xf16> - %4 = tensor.empty() : tensor<10x9216xf16> - %5 = linalg.fill ins(%cst : f16) outs(%4 : tensor<10x9216xf16>) -> tensor<10x9216xf16> - %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<10x9216x9216xf16>) outs(%5 : tensor<10x9216xf16>) { - ^bb0(%in: f16, %out: f16): - %8 = arith.addf %in, %out : f16 - linalg.yield %8 : f16 - } -> tensor<10x9216xf16> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<10x9216x9216xf16>, tensor<10x9216xf16>) outs(%3 : tensor<10x9216x9216xf16>) { - ^bb0(%in: f16, %in_0: f16, %out: f16): - %8 = arith.divf %in, %in_0 : f16 - linalg.yield %8 : f16 - } -> tensor<10x9216x9216xf16> - flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf16> -> !flow.dispatch.tensor> - return - } +func.func @warp_reduction_dispatch() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<10x9216x9216xf16> + %3 = tensor.empty() : tensor<10x9216x9216xf16> + %4 = tensor.empty() : tensor<10x9216xf16> + %5 = linalg.fill ins(%cst : f16) outs(%4 : tensor<10x9216xf16>) -> tensor<10x9216xf16> + %6 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<10x9216x9216xf16>) outs(%5 : tensor<10x9216xf16>) { + 
^bb0(%in: f16, %out: f16): + %8 = arith.addf %in, %out : f16 + linalg.yield %8 : f16 + } -> tensor<10x9216xf16> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<10x9216x9216xf16>, tensor<10x9216xf16>) outs(%3 : tensor<10x9216x9216xf16>) { + ^bb0(%in: f16, %in_0: f16, %out: f16): + %8 = arith.divf %in, %in_0 : f16 + linalg.yield %8 : f16 + } -> tensor<10x9216x9216xf16> + flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf16> -> !flow.dispatch.tensor> + return } // Check fused elementwise ops @@ -140,8 +148,8 @@ module { // CHECK-DAG: %[[WGIDY:.+]] = hal.interface.workgroup.id[1] : index // CHECK-DAG: %[[TIDX:.+]] = gpu.thread_id x -// CHECK-DAG: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK-DAG: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[SPAN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK: gpu.barrier // CHECK: %{{.+}}, %{{.+}} = gpu.shuffle xor %{{.+}}, %[[I1]], %[[I32]] : i32 @@ -165,26 +173,30 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }> -module { - func.func @softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c0 = arith.constant 0 : index - %cst = arith.constant -3.40282347E+38 : f32 - %cst_0 = arith.constant 0.000000e+00 : f32 - %cst_1 = arith.constant 1.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> - %3 = tensor.empty() : tensor<12x128x40960xf32> - %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> - flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : tensor<12x128x40960xf32> -> !flow.dispatch.tensor> - return - } +func.func @softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c0 = arith.constant 0 : index + %cst = arith.constant -3.40282347E+38 : f32 + %cst_0 = arith.constant 0.000000e+00 : f32 + %cst_1 = arith.constant 1.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<12x128x40960xf32> + %3 = tensor.empty() : tensor<12x128x40960xf32> + %4 = linalg.softmax dimension(2) ins(%2 : tensor<12x128x40960xf32>) outs(%3 : tensor<12x128x40960xf32>) -> tensor<12x128x40960xf32> + flow.dispatch.tensor.store %4, %1, offsets = [0, 0, 0], sizes = [12, 128, 40960], strides = [1, 1, 1] : 
tensor<12x128x40960xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func.func @softmax @@ -268,31 +280,35 @@ module { // ----- +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb", { iree.gpu.target = #iree_gpu.target> }> -module { - func.func @dynamic_softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { - %c32_i64 = arith.constant 32 : i64 - %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = arith.extui %0 : i32 to i64 - %3 = arith.extui %1 : i32 to i64 - %4 = arith.shli %3, %c32_i64 : i64 - %5 = arith.ori %2, %4 : i64 - %6 = arith.index_castui %5 : i64 to index - %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%6} - %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6} - %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %6], strides = [1, 1] : !flow.dispatch.tensor>{%6} -> tensor<32x?xf16> - %11 = tensor.empty(%6) : tensor<32x?xf16> - %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16> - flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %6], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor>{%6} - return - } +func.func @dynamic_softmax() attributes {hal.executable.target = #executable_target_vulkan_spirv_fb} { + %c32_i64 = arith.constant 32 : i64 + %c0 = arith.constant 0 : index + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = arith.extui %0 : i32 to i64 + %3 = arith.extui %1 : i32 to i64 + %4 = arith.shli %3, %c32_i64 : i64 + %5 = arith.ori %2, %4 : i64 + %6 = arith.index_castui %5 : i64 to index + %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%6} + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%6} + %10 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [32, %6], strides = [1, 1] : !flow.dispatch.tensor>{%6} -> tensor<32x?xf16> + %11 = tensor.empty(%6) : tensor<32x?xf16> + %12 = linalg.softmax dimension(1) ins(%10 : tensor<32x?xf16>) outs(%11 : tensor<32x?xf16>) -> tensor<32x?xf16> + flow.dispatch.tensor.store %12, %9, offsets = [0, 0], sizes = [32, %6], strides = [1, 1] : tensor<32x?xf16> -> !flow.dispatch.tensor>{%6} + return } // CHECK-LABEL: func.func @dynamic_softmax @@ -302,8 +318,8 @@ module { // CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index // CHECK-DAG: %[[C0_F16:.+]] = arith.constant 0.000000e+00 : f16 -// CHECK: %[[DIM_LBITS:.+]] = hal.interface.constant.load[0] : i32 -// CHECK: %[[DIM_UBITS:.+]] = hal.interface.constant.load[1] : i32 +// CHECK: %[[DIM_LBITS:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 +// CHECK: %[[DIM_UBITS:.+]] = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 // CHECK: %[[EXTL:.+]] = arith.extui %[[DIM_LBITS]] : i32 to i64 // CHECK: %[[EXTU:.+]] = arith.extui %[[DIM_UBITS]] : i32 to i64 // CHECK: %[[SHIFTU:.+]] = arith.shli %[[EXTU]], %{{.*}} : i64 diff --git 
a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir index ceae10c812c48..3cd1cc8c9480a 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_scalar_dispatch.mlir @@ -1,7 +1,11 @@ // RUN: iree-opt --split-input-file --iree-gpu-test-target=pascal@vulkan --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-spirv-select-lowering-strategy-pass, func.func(iree-spirv-lower-executable-target-pass)))))' -mlir-print-local-scope %s | FileCheck %s -#pipeline_layout = #hal.pipeline.layout, <1, storage_buffer>]>]> - +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> hal.executable @scalar_dispatch { hal.executable.variant public @vulkan_spirv_fb target(#hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb">) { hal.executable.export public @scalar_dispatch ordinal(0) layout(#pipeline_layout) { @@ -14,8 +18,8 @@ hal.executable @scalar_dispatch { %c0 = arith.constant 0 : index %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64 %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor> -> tensor %extracted = tensor.extract %2[] : tensor %3 = arith.muli %extracted, %c6364136223846793005_i64 : i64 @@ -30,8 +34,8 @@ hal.executable @scalar_dispatch { // CHECK: func.func @scalar_dispatch() // CHECK-SAME: translation_info = #iree_codegen.translation_info -// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan set(0) binding(1) +// CHECK: %[[SPAN0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK: %[[SPAN1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK: memref.load %[[SPAN0]][] : memref> // CHECK: arith.muli {{.+}} : i64 // CHECK: arith.addi {{.+}} : i64 diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/physical_storage_buffer_addresses.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/physical_storage_buffer_addresses.mlir index fb61908f647ae..c008ecae5f39b 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/physical_storage_buffer_addresses.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/physical_storage_buffer_addresses.mlir @@ -21,9 +21,9 @@ hal.executable private @interface_binding { } { func.func @interface_binding() -> f32 { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8x5xf32, #spirv.storage_class> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<5xf32, #spirv.storage_class> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<4x5xf32, #spirv.storage_class> + %0 
= hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<8x5xf32, #spirv.storage_class> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<5xf32, #spirv.storage_class> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<4x5xf32, #spirv.storage_class> %3 = memref.load %0[%c0, %c0] : memref<8x5xf32, #spirv.storage_class> %4 = memref.load %1[%c0] : memref<5xf32, #spirv.storage_class> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir index b7c9485cbbcf5..ed932b80a57a0 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir @@ -29,11 +29,11 @@ hal.executable public @matmul_256x1024x128_div_exp { %c1024 = arith.constant 1024 : index %c256 = arith.constant 256 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) : !flow.dispatch.tensor> %11 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> %14 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<256x1024xf16> %17 = tensor.empty() : tensor<256x1024xf16> @@ -201,8 +201,7 @@ hal.executable public @matmul_256x1024x128_div_exp { #hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<3, storage_buffer>, - #hal.descriptor_set.binding<4, storage_buffer> + #hal.descriptor_set.binding<3, storage_buffer> ]> ]> hal.executable public @batch_matmul_16x128x256x512_div { @@ -216,10 +215,10 @@ hal.executable public @batch_matmul_16x128x256x512_div { func.func @batch_matmul_16x128x256x512_div() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 128, 512], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x512xf16> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [16, 512, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x512x256xf16> %6 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [16, 128, 256], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<16x128x256xf16> @@ -307,8 +306,7 @@ hal.executable public @batch_matmul_16x128x256x512_div { #hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<3, storage_buffer>, - #hal.descriptor_set.binding<4, storage_buffer> + #hal.descriptor_set.binding<3, storage_buffer> ]> ]> @@ -323,10 +321,10 @@ hal.executable public @matmul_32x32x32_div { func.func @matmul_32x32x32_div() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf16> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf16> %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf16> @@ -376,9 +374,9 @@ hal.executable public @generic_batch_matmul_32x128x512x64 { func.func @generic_batch_matmul_32x128x512x64() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [32, 128, 64], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<32x128x64xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<64x512xf16> %5 = tensor.empty() : tensor<32x128x512xf16> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir index 37fbea0b02598..046c2891330ba 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir @@ -21,10 +21,10 @@ hal.executable @matmul_f32_128x256x64 { func.func @matmul_f32_128x256x64() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x512xf32> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x256xf32> %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf32> @@ -96,10 +96,10 @@ hal.executable @matmul_f16_128x256x64 { func.func @matmul_f16_128x256x64() { %cst = arith.constant 0.0 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x512xf16> %5 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x256xf16> %6 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<128x256xf16> @@ -157,8 +157,7 @@ hal.executable @matmul_f16_128x256x64 { #hal.descriptor_set.layout<0, bindings = [ #hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<3, storage_buffer> + #hal.descriptor_set.binding<2, storage_buffer> ]> ]> @@ -177,9 +176,9 @@ hal.executable @matmul_f16_32x1280x1280 { func.func @matmul_f16_32x1280x1280() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x1280xf16> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1280x1280xf16> %5 = tensor.empty() : tensor<32x1280xf16> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir index 1abafddd7804a..f9a186cf95de2 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_vectorization.mlir @@ -19,9 +19,9 @@ hal.executable private @fuse_and_vectorize_fill_matmul { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 %c4096 = arith.constant 4096 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %8 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x4096xf32> %10 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 4096], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x4096xf32> %15 = tensor.empty() : tensor<4096x4096xf32> @@ -65,10 +65,10 @@ hal.executable private @fuse_and_vectorize_matmul_add { %cst = arith.constant 0.000000e+00 : f32 %c1024 = arith.constant 1024 : index %c256 = arith.constant 256 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) : !flow.dispatch.tensor> %10 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 256], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x256xf32> %13 = tensor.empty() : tensor<1024x256xf32> %15 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1024x512xf32> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir index 34acbde451daa..aa5d7cb5ea784 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matvec.mlir @@ -27,11 +27,11 @@ hal.executable @i4_dequant_unit_matmul_f16 { func.func @i4_dequant_unit_matmul_f16() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %4 = 
hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : !flow.dispatch.tensor> %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x128xi4> %6 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x1xf16> %7 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [4096, 86, 1], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<4096x86x1xf16> @@ -117,9 +117,7 @@ hal.executable @i4_dequant_unit_matmul_f16 { #hal.descriptor_set.layout<0, bindings = [ #hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<3, storage_buffer>, - #hal.descriptor_set.binding<4, storage_buffer> + #hal.descriptor_set.binding<2, storage_buffer> ]> ]> hal.executable @i4_dequant_matvec_f16_subgroup_64 { @@ -137,21 +135,21 @@ hal.executable @i4_dequant_matvec_f16_subgroup_64 { builtin.module { func.func @i4_dequant_matvec_f16_subgroup_64() { %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 + %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 + %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32 + %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32 + %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32 + %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32 %5 = arith.index_castui %0 : i32 to index %6 = arith.index_castui %1 : i32 to index %7 = arith.index_castui %2 : i32 to index %8 = arith.index_castui %3 : i32 to index %9 = arith.index_castui %4 : i32 to index - %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> - %11 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor> - %12 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor> - %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor> - %14 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%9) : !flow.dispatch.tensor> + %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%5) flags(ReadOnly) : !flow.dispatch.tensor> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%6) flags(ReadOnly) : !flow.dispatch.tensor> + %12 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%7) flags(ReadOnly) : !flow.dispatch.tensor> + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%8) flags(ReadOnly) : !flow.dispatch.tensor> + %14 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%9) : !flow.dispatch.tensor> %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : 
!flow.dispatch.tensor> -> tensor<4096x86x128xi4> %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf16> %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<4096x86xf16> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir index 9c8825c6fbd2f..03a12aafb230f 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_reduction_subgroup.mlir @@ -18,8 +18,8 @@ hal.executable private @subgroup_reduce { func.func @subgroup_reduce() { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2, 512], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x512xf32> %3 = tensor.empty() : tensor<2xf32> %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<2xf32>) -> tensor<2xf32> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir index e995a934005d9..b2e8d5bba2e47 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_sub_byte_dequant.mlir @@ -18,10 +18,10 @@ hal.executable @i4_dequant { builtin.module { func.func @i4_dequant() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %4 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [131072, 128], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<131072x128xi4> %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor> -> tensor<131072xf32> %6 = 
flow.dispatch.tensor.load %2, offsets = [0], sizes = [131072], strides = [1] : !flow.dispatch.tensor> -> tensor<131072xf32> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir index 66b4d3cc4fdf3..331832e2fbd96 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/set_transform_strategy.mlir @@ -6,21 +6,26 @@ // core, but there are no such wmma intrinsics. Fix it to support fp16-input. // TODO: | FileCheck %s -module { - func.func @matmul() { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> - %5 = tensor.empty() : tensor<2052x2052xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> - flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> - return - } +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> +func.func @matmul() { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2052, 2556], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2052x2556xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2556, 2052], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2556x2052xf32> + %5 = tensor.empty() : tensor<2052x2052xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<2052x2556xf32>, tensor<2556x2052xf32>) outs(%6 : tensor<2052x2052xf32>) -> tensor<2052x2052xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2052, 2052], strides = [1, 1] : tensor<2052x2052xf32> -> !flow.dispatch.tensor> + return } // CHECK-LABEL: func @matmul diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute.mlir index b57bb12425802..27dbd92825cca 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute.mlir 
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute.mlir @@ -10,7 +10,7 @@ #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -#pipeline_layout = #hal.pipeline.layout, #hal.descriptor_set.binding<1, storage_buffer>, @@ -26,12 +26,12 @@ hal.executable private @matmul { builtin.module { func.func @matmul() { %c0 = arith.constant 0 : index - %M = hal.interface.constant.load[0] : index - %N = hal.interface.constant.load[1] : index - %K = hal.interface.constant.load[2] : index - %arg0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref{%M, %K} - %arg1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref{%K, %N} - %arg2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref{%M, %N} + %M = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %arg0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref{%M, %K} + %arg1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref{%K, %N} + %arg2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref{%M, %N} %c4 = arith.constant 4 : index %c1 = arith.constant 1 : index %0 = memref.dim %arg0, %c1 : memref @@ -97,9 +97,9 @@ hal.executable private @conv_1d { func.func @conv_1d() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<3x6x1xf32> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<3x8x1xf32> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<3x1x1xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<3x6x1xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<3x8x1xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<3x1x1xf32> %3 = gpu.block_id x %4 = gpu.block_id y %5 = gpu.block_id z @@ -125,9 +125,9 @@ hal.executable private @conv_1d { } // CHECK-LABEL: func.func @conv_1d -// CHECK-DAG: %[[RET:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) +// CHECK-DAG: %[[RET:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) // CHECK-DAG: %[[ARG0SV1:.+]] = memref.subview %[[ARG0]] // CHECK-DAG: %[[ARG1SV1:.+]] = memref.subview %[[ARG1]] // CHECK-DAG: %[[RETSV1:.+]] = memref.subview %[[RET]] @@ -173,18 +173,18 @@ hal.executable private @conv_2d { builtin.module { func.func @conv_2d() { %c0 = arith.constant 0 : index - %n = hal.interface.constant.load[0] : index - %oh = hal.interface.constant.load[1] : index - %ow = hal.interface.constant.load[2] : index - %oc = hal.interface.constant.load[3] : index - %ih = hal.interface.constant.load[4] : index - %iw = hal.interface.constant.load[5] : index - %ic = hal.interface.constant.load[6] : index - %fh = 
hal.interface.constant.load[7] : index - %fw = hal.interface.constant.load[8] : index - %arg0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref{%n, %ih, %iw, %ic} - %arg1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref{%fh, %fw, %ic, %oc} - %arg2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref{%n, %oh, %ow, %oc} + %n = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %oh = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index + %ow = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index + %oc = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index + %ih = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index + %iw = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index + %ic = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : index + %fh = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : index + %fw = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : index + %arg0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref{%n, %ih, %iw, %ic} + %arg1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref{%fh, %fw, %ic, %oc} + %arg2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref{%n, %oh, %ow, %oc} %c2 = arith.constant 2 : index %c3 = arith.constant 3 : index %c1 = arith.constant 1 : index @@ -239,9 +239,9 @@ hal.executable private @conv_2d { // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 4)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (s0 * 32)> // CHECK: func.func @conv_2d -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK-DAG: %[[C0:.+]] = arith.constant 0 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 // CHECK-DAG: %[[C4:.+]] = arith.constant 4 @@ -289,9 +289,9 @@ hal.executable private @conv_3d { func.func @conv_3d() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2x7x7x7x2xf32> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2x8x8x8x3xf32> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<2x2x2x3x2xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<2x7x7x7x2xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2x8x8x8x3xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<2x2x2x3x2xf32> %3 = gpu.block_id x %4 = gpu.block_id y %5 = gpu.block_id z @@ -359,9 +359,9 @@ module { builtin.module { func.func @pooling_nhwc_max() { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2x16x16x6xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) 
type(storage_buffer) : memref<3x4xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<2x14x13x6xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2x16x16x6xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<3x4xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<2x14x13x6xf32> %3 = gpu.block_id x %4 = gpu.block_id y %5 = affine.apply #map0()[%4] @@ -385,9 +385,9 @@ module { // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 4)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 * 32)> // CHECK: func.func @pooling_nhwc_max -// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) -// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) -// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) +// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) +// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // CHECK: %[[SV1:.+]] = memref.subview %[[ARG0]] // CHECK: %[[SV2:.+]] = memref.subview %[[RET0]] // CHECK-DAG: %[[TIDX:.+]] = gpu.thread_id x @@ -428,9 +428,9 @@ hal.executable @matvec { %c250 = arith.constant 250 : index %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<250x1024xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1024xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<250xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<250x1024xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<1024xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<250xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_scatter.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_scatter.mlir index 93abfa0a7e96c..8ce2c91849110 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_scatter.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_scatter.mlir @@ -20,9 +20,9 @@ hal.executable private @static_scatter_update_slice { %c40 = arith.constant 40 : index %c500 = arith.constant 500 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<40x500xi32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<40x1xi32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<100x500xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<40x500xi32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<40x1xi32> + 
%2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : memref<100x500xi32>
 %workgroup_id_x = hal.interface.workgroup.id[0] : index
 %workgroup_count_x = hal.interface.workgroup.count[0] : index
 %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -50,9 +50,9 @@ hal.executable private @static_scatter_update_slice {
 }

 // CHECK-LABEL: func.func @static_scatter_update_slice()
-// CHECK: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
-// CHECK: %[[ARG1:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer)
-// CHECK: %[[ARG2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
+// CHECK: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK: %[[ARG1:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
+// CHECK: %[[ARG2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
 // CHECK: scf.for
 // CHECK: scf.for
 // CHECK: %[[WG_UPDATE:.+]] = memref.subview %[[ARG0]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir
index fe2ddfd38e59d..42738b7f3c255 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_distribute_sort.mlir
@@ -16,7 +16,7 @@ hal.executable private @static_3d_sort {
 builtin.module {
 func.func @static_3d_sort() {
 %c0 = arith.constant 0 : index
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<64x32x128xi32, #hal.descriptor_type<storage_buffer>>
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<64x32x128xi32, #hal.descriptor_type<storage_buffer>>
 memref.assume_alignment %0, 64 : memref<64x32x128xi32, #hal.descriptor_type<storage_buffer>>
 %workgroup_id_x = hal.interface.workgroup.id[0] : index
 %workgroup_id_y = hal.interface.workgroup.id[1] : index
@@ -34,7 +34,7 @@ hal.executable private @static_3d_sort {
 }

 // CHECK-LABEL: func.func @static_3d_sort()
-// CHECK: %[[ARG0:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer)
+// CHECK: %[[ARG0:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
 // CHECK: %[[WG_OUTPUT:.+]] = memref.subview %[[ARG0]]
 // CHECK: %[[TID_X:.+]] = gpu.thread_id x
 // CHECK: %[[DIM_X:.+]] = gpu.block_dim x
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir
index 23b272cb40b67..daa7c7e5c8814 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_cooperative_matrix.mlir
@@ -8,50 +8,56 @@

 // Single tile per workgroup means no subview ops for promotion.
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>,
+ #hal.descriptor_set.binding<3, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 32)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 #translation = #iree_codegen.translation_info
-module {
- func.func @matmul_f16_32x32x32() attributes {translation_info = #translation} {
- %c32 = arith.constant 32 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %0, 64 : memref<32x32xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %1, 64 : memref<32x32xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %2, 64 : memref<32x32xf16>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<32x32xf16>
- memref.assume_alignment %3, 64 : memref<32x32xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %4 = affine.apply #map()[%workgroup_id_y]
- %5 = affine.apply #map()[%workgroup_count_y]
- scf.for %arg0 = %4 to %c32 step %5 {
- %6 = affine.apply #map()[%workgroup_id_x]
- %7 = affine.apply #map()[%workgroup_count_x]
- scf.for %arg1 = %6 to %c32 step %7 {
- linalg.fill ins(%cst : f16) outs(%3 : memref<32x32xf16>)
- linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x32xf16>, memref<32x32xf16>) outs(%3 : memref<32x32xf16>)
- linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<32x32xf16>) outs(%3 : memref<32x32xf16>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.divf %out, %in : f16
- linalg.yield %8 : f16
- }
+func.func @matmul_f16_32x32x32() attributes {translation_info = #translation} {
+ %c32 = arith.constant 32 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %0, 64 : memref<32x32xf16>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %1, 64 : memref<32x32xf16>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %2, 64 : memref<32x32xf16>
+ %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<32x32xf16>
+ memref.assume_alignment %3, 64 : memref<32x32xf16>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y = hal.interface.workgroup.count[1] : index
+ %4 = affine.apply #map()[%workgroup_id_y]
+ %5 = affine.apply #map()[%workgroup_count_y]
+ scf.for %arg0 = %4 to %c32 step %5 {
+ %6 = affine.apply #map()[%workgroup_id_x]
+ %7 = affine.apply #map()[%workgroup_count_x]
+ scf.for %arg1 = %6 to %c32 step %7 {
+ linalg.fill ins(%cst : f16) outs(%3 : memref<32x32xf16>)
+ linalg.matmul {lowering_config = #config} ins(%0, %1 : memref<32x32xf16>, memref<32x32xf16>) outs(%3 : memref<32x32xf16>)
+ linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%2 : memref<32x32xf16>) outs(%3 : memref<32x32xf16>) {
+ ^bb0(%in: f16, %out: f16):
+ %8 = arith.divf %out, %in : f16
+ linalg.yield %8 : f16
 }
 }
- return
 }
+ return
 }

// CHECK-LABEL: func.func @matmul_f16_32x32x32()
-// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan set(0) binding(0)
-// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan set(0) binding(1)
+// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0)
+// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1)
 // CHECK-NOT: memref.alloc()
 // CHECK-NOT: memref.copy
@@ -63,6 +69,13 @@ module {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 32)>
 #map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>
@@ -70,51 +83,49 @@ module {
 #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
 #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #translation = #iree_codegen.translation_info
-module {
- func.func @generic_batch_matmul_f16_32x128x512x64() attributes {translation_info = #translation} {
- %c32 = arith.constant 32 : index
- %c128 = arith.constant 128 : index
- %c512 = arith.constant 512 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
- %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %workgroup_id_z = hal.interface.workgroup.id[2] : index
- %workgroup_count_z = hal.interface.workgroup.count[2] : index
- scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z {
- %4 = affine.apply #map()[%workgroup_id_y]
- %5 = affine.apply #map()[%workgroup_count_y]
- scf.for %arg1 = %4 to %c128 step %5 {
- %6 = affine.apply #map()[%workgroup_id_x]
- %7 = affine.apply #map()[%workgroup_count_x]
- scf.for %arg2 = %6 to %c512 step %7 {
- %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>
- %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>
- %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>)
- linalg.generic {indexing_maps = [#map1,
#map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} { - ^bb0(%in: f16, %in_3: f16, %out: f16): - %8 = arith.mulf %in, %in_3 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } - %subview_2 = memref.subview %3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> - linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) { - ^bb0(%in: f16, %out: f16): - %8 = math.exp %in : f16 - linalg.yield %8 : f16 - } +func.func @generic_batch_matmul_f16_32x128x512x64() attributes {translation_info = #translation} { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<128x32x64xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x64x512xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x128x512xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x128x512xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z { + %4 = affine.apply #map()[%workgroup_id_y] + %5 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %4 to %c128 step %5 { + %6 = affine.apply #map()[%workgroup_id_x] + %7 = affine.apply #map()[%workgroup_count_x] + scf.for %arg2 = %6 to %c512 step %7 { + %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> + %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>> + %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>> + linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} { + ^bb0(%in: f16, %in_3: f16, %out: f16): + %8 = arith.mulf %in, %in_3 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield 
%9 : f16 + } + %subview_2 = memref.subview %3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> + linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) { + ^bb0(%in: f16, %out: f16): + %8 = math.exp %in : f16 + linalg.yield %8 : f16 } } } - return } + return } // CHECK-LABEL: func.func @generic_batch_matmul_f16_32x128x512x64() @@ -172,6 +183,13 @@ module { // Cooperative matrix fusable elementwise ops do not need promote C. +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 32)> #map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)> @@ -179,51 +197,49 @@ module { #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #translation = #iree_codegen.translation_info -module { - func.func @generic_batch_matmul_f16_32x128x512x64() attributes {translation_info = #translation} { - %c32 = arith.constant 32 : index - %c128 = arith.constant 128 : index - %c512 = arith.constant 512 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z { - %4 = affine.apply #map()[%workgroup_id_y] - %5 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %4 to %c128 step %5 { - %6 = affine.apply #map()[%workgroup_id_x] - %7 = affine.apply #map()[%workgroup_count_x] - scf.for %arg2 = %6 to %c512 step %7 { - %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> - %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>> - %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>> - linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, 
strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} { - ^bb0(%in: f16, %in_3: f16, %out: f16): - %8 = arith.mulf %in, %in_3 : f16 - %9 = arith.addf %out, %8 : f16 - linalg.yield %9 : f16 - } - %subview_2 = memref.subview %3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> - linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel"]} ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) { - ^bb0(%in: f16, %out: f16): - %8 = arith.divf %out, %in : f16 - linalg.yield %8 : f16 - } +func.func @generic_batch_matmul_f16_32x128x512x64() attributes {translation_info = #translation} { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<128x32x64xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x64x512xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x128x512xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x128x512xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z { + %4 = affine.apply #map()[%workgroup_id_y] + %5 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %4 to %c128 step %5 { + %6 = affine.apply #map()[%workgroup_id_x] + %7 = affine.apply #map()[%workgroup_count_x] + scf.for %arg2 = %6 to %c512 step %7 { + %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> + %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>> + %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>> + linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} { + ^bb0(%in: f16, %in_3: f16, %out: f16): + %8 = arith.mulf %in, %in_3 : f16 + %9 = arith.addf %out, %8 : f16 + linalg.yield %9 : f16 + } + %subview_2 = memref.subview %3[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> + linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", 
"parallel", "parallel"]} ins(%subview_2 : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) { + ^bb0(%in: f16, %out: f16): + %8 = arith.divf %out, %in : f16 + linalg.yield %8 : f16 } } } - return } + return } // PROMOTEC-LABEL: func.func @generic_batch_matmul_f16_32x128x512x64() @@ -251,50 +267,55 @@ module { // No need to promote C if there is no fused element wise ops. +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 32)> #map1 = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #translation = #iree_codegen.translation_info -module { - func.func @generic_batch_matmul_f16_32x128x512x64() attributes {translation_info = #translation} { - %c32 = arith.constant 32 : index - %c128 = arith.constant 128 : index - %c512 = arith.constant 512 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<128x32x64xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64x512xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x128x512xf16> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z { - %3 = affine.apply #map()[%workgroup_id_y] - %4 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %3 to %c128 step %4 { - %5 = affine.apply #map()[%workgroup_id_x] - %6 = affine.apply #map()[%workgroup_count_x] - scf.for %arg2 = %5 to %c512 step %6 { - %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> - %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>> - %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>> - linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} { - ^bb0(%in: f16, %in_2: f16, %out: f16): - %7 = arith.mulf %in, %in_2 : f16 - %8 = arith.addf %out, %7 : f16 - linalg.yield %8 : f16 - } +func.func @generic_batch_matmul_f16_32x128x512x64() attributes {translation_info = #translation} { + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c512 = arith.constant 512 : 
index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<128x32x64xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x64x512xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<32x128x512xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + scf.for %arg0 = %workgroup_id_z to %c32 step %workgroup_count_z { + %3 = affine.apply #map()[%workgroup_id_y] + %4 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %3 to %c128 step %4 { + %5 = affine.apply #map()[%workgroup_id_x] + %6 = affine.apply #map()[%workgroup_count_x] + scf.for %arg2 = %5 to %c512 step %6 { + %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 32, 32] [1, 1, 1] : memref<32x128x512xf16> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> + %subview_0 = memref.subview %0[%arg1, %arg0, 0] [32, 1, 64] [1, 1, 1] : memref<128x32x64xf16> to memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>> + %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 64, 32] [1, 1, 1] : memref<32x64x512xf16> to memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>> + linalg.fill ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%subview_0, %subview_1 : memref<32x1x64xf16, strided<[2048, 64, 1], offset: ?>>, memref<1x64x32xf16, strided<[32768, 512, 1], offset: ?>>) outs(%subview : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>>) attrs = {lowering_config = #config} { + ^bb0(%in: f16, %in_2: f16, %out: f16): + %7 = arith.mulf %in, %in_2 : f16 + %8 = arith.addf %out, %7 : f16 + linalg.yield %8 : f16 } } } - return } + return } // PROMOTEC-LABEL: func.func @generic_batch_matmul_f16_32x128x512x64() @@ -304,7 +325,7 @@ module { // PROMOTEC: %[[LHS_ALLOC:.+]] = memref.alloc() : memref<32x1x32xf16, #gpu.address_space> // PROMOTEC-NOT: memref.alloc() -// PROMOTEC: %[[SPAN2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) +// PROMOTEC: %[[SPAN2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) // PROMOTEC: %[[OUT_VIEW:.+]] = memref.subview %[[SPAN2]] // PROMOTEC: linalg.fill @@ -331,44 +352,49 @@ module { // No need to promote again with allocations from bufferization. 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 64)>
 #map1 = affine_map<()[s0] -> (s0 * 128)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #translation = #iree_codegen.translation_info
-module {
- func.func @batch_matmul_f16_1x64x128x512() attributes {translation_info = #translation} {
- %c4096 = arith.constant 4096 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x4096x512xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1x512x4096xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<1x4096x4096xf32>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %3 = affine.apply #map()[%workgroup_id_y]
- %4 = affine.apply #map()[%workgroup_count_y]
- scf.for %arg0 = %3 to %c4096 step %4 {
- %5 = affine.apply #map1()[%workgroup_id_x]
- %6 = affine.apply #map1()[%workgroup_count_x]
- scf.for %arg1 = %5 to %c4096 step %6 {
- %subview = memref.subview %2[0, %arg0, %arg1] [1, 64, 128] [1, 1, 1] : memref<1x4096x4096xf32> to memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>>
- %subview_0 = memref.subview %0[0, %arg0, 0] [1, 64, 512] [1, 1, 1] : memref<1x4096x512xf16> to memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>>
- %subview_1 = memref.subview %1[0, 0, %arg1] [1, 512, 128] [1, 1, 1] : memref<1x512x4096xf16> to memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>>
- %alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, #gpu.address_space<workgroup>>
- linalg.fill ins(%cst : f16) outs(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>)
- linalg.batch_matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>>, memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>>) outs(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>)
- linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, #gpu.address_space<workgroup>>) outs(%subview : memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f32):
- %7 = arith.extf %in : f16 to f32
- linalg.yield %7 : f32
- }
+func.func @batch_matmul_f16_1x64x128x512() attributes {translation_info = #translation} {
+ %c4096 = arith.constant 4096 : index
+ %c0 = arith.constant 0 : index
+ %cst = arith.constant 0.000000e+00 : f16
+ %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<1x4096x512xf16>
+ %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<1x512x4096xf16>
+ %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<1x4096x4096xf32>
+ %workgroup_id_x = hal.interface.workgroup.id[0] : index
+ %workgroup_count_x = hal.interface.workgroup.count[0] : index
+ %workgroup_id_y = hal.interface.workgroup.id[1] : index
+ %workgroup_count_y =
hal.interface.workgroup.count[1] : index + %3 = affine.apply #map()[%workgroup_id_y] + %4 = affine.apply #map()[%workgroup_count_y] + scf.for %arg0 = %3 to %c4096 step %4 { + %5 = affine.apply #map1()[%workgroup_id_x] + %6 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg1 = %5 to %c4096 step %6 { + %subview = memref.subview %2[0, %arg0, %arg1] [1, 64, 128] [1, 1, 1] : memref<1x4096x4096xf32> to memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>> + %subview_0 = memref.subview %0[0, %arg0, 0] [1, 64, 512] [1, 1, 1] : memref<1x4096x512xf16> to memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>> + %subview_1 = memref.subview %1[0, 0, %arg1] [1, 512, 128] [1, 1, 1] : memref<1x512x4096xf16> to memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>> + %alloc = memref.alloc() {alignment = 128 : i64} : memref<1x64x128xf16, #gpu.address_space> + linalg.fill ins(%cst : f16) outs(%alloc : memref<1x64x128xf16, #gpu.address_space>) + linalg.batch_matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<1x64x512xf16, strided<[2097152, 512, 1], offset: ?>>, memref<1x512x128xf16, strided<[2097152, 4096, 1], offset: ?>>) outs(%alloc : memref<1x64x128xf16, #gpu.address_space>) + linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel", "parallel"]} ins(%alloc : memref<1x64x128xf16, #gpu.address_space>) outs(%subview : memref<1x64x128xf32, strided<[16777216, 4096, 1], offset: ?>>) { + ^bb0(%in: f16, %out: f32): + %7 = arith.extf %in : f16 to f32 + linalg.yield %7 : f32 } } - return } + return } // PROMOTEC-LABEL: func.func @batch_matmul_f16_1x64x128x512() @@ -402,47 +428,54 @@ module { // PROMOTEC: gpu.barrier // ----- + +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * 128)> #map2 = affine_map<(d0, d1) -> (d1)> #map3 = affine_map<(d0, d1) -> (d0, d1)> #translation = #iree_codegen.translation_info -module { - func.func @matmul_f16_f512x4096x64() attributes {translation_info = #translation} { - %c512 = arith.constant 512 : index - %c4096 = arith.constant 4096 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<512x64xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64x4096xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<4096xf16> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<512x4096xf16> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %4 = affine.apply #map()[%workgroup_id_y] - %5 = affine.apply #map()[%workgroup_count_y] - scf.for %arg0 = %4 to %c512 step %5 { - %6 = affine.apply #map1()[%workgroup_id_x] - %7 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg1 = %6 to %c4096 step %7 { - %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>> - %subview_0 = 
memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>> - %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>> - linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) - linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) - %subview_2 = memref.subview %2[%arg1] [128] [1] : memref<4096xf16> to memref<128xf16, strided<[1], offset: ?>> - linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2 : memref<128xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) { - ^bb0(%in: f16, %out: f16): - %8 = arith.addf %out, %in : f16 - linalg.yield %8 : f16 - } +func.func @matmul_f16_f512x4096x64() attributes {translation_info = #translation} { + %c512 = arith.constant 512 : index + %c4096 = arith.constant 4096 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<512x64xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<64x4096xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<4096xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<512x4096xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %4 = affine.apply #map()[%workgroup_id_y] + %5 = affine.apply #map()[%workgroup_count_y] + scf.for %arg0 = %4 to %c512 step %5 { + %6 = affine.apply #map1()[%workgroup_id_x] + %7 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg1 = %6 to %c4096 step %7 { + %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>> + %subview_0 = memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>> + %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>> + linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) + linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) + %subview_2 = memref.subview %2[%arg1] [128] [1] : memref<4096xf16> to memref<128xf16, strided<[1], offset: ?>> + linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2 : memref<128xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) { + ^bb0(%in: f16, %out: f16): + %8 = arith.addf %out, %in : f16 + linalg.yield %8 : f16 } } - return } + return } // PROMOTEC-LABEL: func.func @matmul_f16_f512x4096x64() @@ -452,8 +485,8 @@ module { // PROMOTEC-DAG: 
%[[RHS_ALLOC:.+]] = memref.alloc() : memref<32x128xf16, #gpu.address_space<workgroup>>
 // PROMOTEC-NOT: memref.alloc()

-// PROMOTEC: %[[SPAN2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer)
-// PROMOTEC: %[[SPAN3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer)
+// PROMOTEC: %[[SPAN2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2)
+// PROMOTEC: %[[SPAN3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3)
 // PROMOTEC: %[[OUT_VIEW:.+]] = memref.subview %[[SPAN3]]
 // PROMOTEC: linalg.fill
@@ -487,47 +520,53 @@ module {

 // Transposed+broadcasted elementwise ops does not need promoting C matrix.

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+ #hal.descriptor_set.layout<0, bindings = [
+ #hal.descriptor_set.binding<0, storage_buffer>,
+ #hal.descriptor_set.binding<1, storage_buffer>,
+ #hal.descriptor_set.binding<2, storage_buffer>,
+ #hal.descriptor_set.binding<3, storage_buffer>
+ ]>
+]>
 #config = #iree_codegen.lowering_config
 #map = affine_map<()[s0] -> (s0 * 64)>
 #map1 = affine_map<()[s0] -> (s0 * 128)>
 #map2 = affine_map<(d0, d1) -> (d0)>
 #map3 = affine_map<(d0, d1) -> (d0, d1)>
 #translation = #iree_codegen.translation_info
-module {
- func.func @matmul_f16_f512x4096x64() attributes {translation_info = #translation} {
- %c512 = arith.constant 512 : index
- %c4096 = arith.constant 4096 : index
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f16
- %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<512x64xf16>
- %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<64x4096xf16>
- %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<512xf16>
- %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<512x4096xf16>
- %workgroup_id_x = hal.interface.workgroup.id[0] : index
- %workgroup_count_x = hal.interface.workgroup.count[0] : index
- %workgroup_id_y = hal.interface.workgroup.id[1] : index
- %workgroup_count_y = hal.interface.workgroup.count[1] : index
- %4 = affine.apply #map()[%workgroup_id_y]
- %5 = affine.apply #map()[%workgroup_count_y]
- scf.for %arg0 = %4 to %c512 step %5 {
- %6 = affine.apply #map1()[%workgroup_id_x]
- %7 = affine.apply #map1()[%workgroup_count_x]
- scf.for %arg1 = %6 to %c4096 step %7 {
- %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
- %subview_0 = memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>>
- %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>>
- linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>)
- %subview_2 = memref.subview %2[%arg0] [64] [1] : memref<512xf16> to memref<64xf16, strided<[1], offset: ?>>
- linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2 : memref<64xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) {
- ^bb0(%in: f16, %out: f16):
- %8 = arith.addf %out, %in : f16
- linalg.yield %8 : f16
- }
+func.func
@matmul_f16_f512x4096x64() attributes {translation_info = #translation} { + %c512 = arith.constant 512 : index + %c4096 = arith.constant 4096 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<512x64xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<64x4096xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<512xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<512x4096xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %4 = affine.apply #map()[%workgroup_id_y] + %5 = affine.apply #map()[%workgroup_count_y] + scf.for %arg0 = %4 to %c512 step %5 { + %6 = affine.apply #map1()[%workgroup_id_x] + %7 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg1 = %6 to %c4096 step %7 { + %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<512x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>> + %subview_0 = memref.subview %0[%arg0, 0] [64, 64] [1, 1] : memref<512x64xf16> to memref<64x64xf16, strided<[64, 1], offset: ?>> + %subview_1 = memref.subview %1[0, %arg1] [64, 128] [1, 1] : memref<64x4096xf16> to memref<64x128xf16, strided<[4096, 1], offset: ?>> + linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) + linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<64x64xf16, strided<[64, 1], offset: ?>>, memref<64x128xf16, strided<[4096, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) + %subview_2 = memref.subview %2[%arg0] [64] [1] : memref<512xf16> to memref<64xf16, strided<[1], offset: ?>> + linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2 : memref<64xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[4096, 1], offset: ?>>) { + ^bb0(%in: f16, %out: f16): + %8 = arith.addf %out, %in : f16 + linalg.yield %8 : f16 } } - return } + return } // PROMOTEC-LABEL: func.func @matmul_f16_f512x4096x64() @@ -537,8 +576,8 @@ module { // PROMOTEC-DAG: %[[RHS_ALLOC:.+]] = memref.alloc() : memref<32x128xf16, #gpu.address_space> // PROMOTEC-NOT: memref.alloc() -// PROMOTEC: %[[SPAN2:.+]] = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) -// PROMOTEC: %[[SPAN3:.+]] = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) +// PROMOTEC: %[[SPAN2:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) +// PROMOTEC: %[[SPAN3:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) // PROMOTEC: %[[OUT_VIEW:.+]] = memref.subview %[[SPAN3]] // PROMOTEC: linalg.fill @@ -572,50 +611,55 @@ module { // Inlined large constant array needs promoting C matrix. 
+#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * 128)> #map2 = affine_map<(d0, d1) -> (d0)> #map3 = affine_map<(d0, d1) -> (d0, d1)> #translation = #iree_codegen.translation_info -module { - func.func @matmul_f16_128x262144x2304() attributes {translation_info = #translation} { - %c128 = arith.constant 128 : index - %c262144 = arith.constant 262144 : index - %c96565312 = arith.constant 96565312 : index - %c806357120 = arith.constant 806357120 : index - %c134217728 = arith.constant 134217728 : index - %cst = arith.constant 0.000000e+00 : f16 - %cst_0 = arith.constant dense<"0x69222B2E40A3002A45AC1AAB2E2E202DA21C212680264C2A102314A041A7D029CB28352E5BAAD3B02F299D9A142B8AA1D1285C28412B25AF9A24EE2BA22C242D53AD9E2948A9289FCF301D28012F08AD68A6DD20ECAC912465290B2E9420C5AA50A222A912AB9526B62ADA2039AD4D912C9FDD287B20B224D329BA2A4D2C41A76DAB7E30B027F62ED1A0F1273A2BAE9D0FA48029812992A65AA92A2C9C2EE9A744A4632C5FA8A9A4CF2D70A482A0F5A2DBA7B6304B9D22A52B1B9DA8E424722AB5ACD0248A2B8B29C82D782E402D1A99F0A60CA4DE2DD32815266F2A6B247FA6FE214E2853AA402390AB6925F1A339307F2664A23CACBE28BA2B3D286DB0BA2E"> : tensor<128xf16> - %0 = bufferization.to_memref %cst_0 : memref<128xf16> - %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c96565312) : memref<128x2304xf16> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c806357120) : memref<2304x262144xf16> - %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c134217728) : memref<128x262144xf16> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %4 = affine.apply #map()[%workgroup_id_y] - %5 = affine.apply #map()[%workgroup_count_y] - scf.for %arg0 = %4 to %c128 step %5 { - %6 = affine.apply #map1()[%workgroup_id_x] - %7 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg1 = %6 to %c262144 step %7 { - %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<128x262144xf16> to memref<64x128xf16, strided<[262144, 1], offset: ?>> - %subview_1 = memref.subview %1[%arg0, 0] [64, 2304] [1, 1] : memref<128x2304xf16> to memref<64x2304xf16, strided<[2304, 1], offset: ?>> - %subview_2 = memref.subview %2[0, %arg1] [2304, 128] [1, 1] : memref<2304x262144xf16> to memref<2304x128xf16, strided<[262144, 1], offset: ?>> - linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) - linalg.matmul {lowering_config = #config} ins(%subview_1, %subview_2 : memref<64x2304xf16, strided<[2304, 1], offset: ?>>, memref<2304x128xf16, strided<[262144, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) - %subview_3 = memref.subview %0[%arg0] [64] [1] : memref<128xf16> to memref<64xf16, strided<[1], offset: ?>> - linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<64xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) { - ^bb0(%in: f16, %out: f16): - %8 = arith.addf %out, %in : f16 - linalg.yield %8 : f16 - } +func.func @matmul_f16_128x262144x2304() attributes 
{translation_info = #translation} { + %c128 = arith.constant 128 : index + %c262144 = arith.constant 262144 : index + %c96565312 = arith.constant 96565312 : index + %c806357120 = arith.constant 806357120 : index + %c134217728 = arith.constant 134217728 : index + %cst = arith.constant 0.000000e+00 : f16 + %cst_0 = arith.constant dense<"0x69222B2E40A3002A45AC1AAB2E2E202DA21C212680264C2A102314A041A7D029CB28352E5BAAD3B02F299D9A142B8AA1D1285C28412B25AF9A24EE2BA22C242D53AD9E2948A9289FCF301D28012F08AD68A6DD20ECAC912465290B2E9420C5AA50A222A912AB9526B62ADA2039AD4D912C9FDD287B20B224D329BA2A4D2C41A76DAB7E30B027F62ED1A0F1273A2BAE9D0FA48029812992A65AA92A2C9C2EE9A744A4632C5FA8A9A4CF2D70A482A0F5A2DBA7B6304B9D22A52B1B9DA8E424722AB5ACD0248A2B8B29C82D782E402D1A99F0A60CA4DE2DD32815266F2A6B247FA6FE214E2853AA402390AB6925F1A339307F2664A23CACBE28BA2B3D286DB0BA2E"> : tensor<128xf16> + %0 = bufferization.to_memref %cst_0 : memref<128xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c96565312) : memref<128x2304xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c806357120) : memref<2304x262144xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c134217728) : memref<128x262144xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %4 = affine.apply #map()[%workgroup_id_y] + %5 = affine.apply #map()[%workgroup_count_y] + scf.for %arg0 = %4 to %c128 step %5 { + %6 = affine.apply #map1()[%workgroup_id_x] + %7 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg1 = %6 to %c262144 step %7 { + %subview = memref.subview %3[%arg0, %arg1] [64, 128] [1, 1] : memref<128x262144xf16> to memref<64x128xf16, strided<[262144, 1], offset: ?>> + %subview_1 = memref.subview %1[%arg0, 0] [64, 2304] [1, 1] : memref<128x2304xf16> to memref<64x2304xf16, strided<[2304, 1], offset: ?>> + %subview_2 = memref.subview %2[0, %arg1] [2304, 128] [1, 1] : memref<2304x262144xf16> to memref<2304x128xf16, strided<[262144, 1], offset: ?>> + linalg.fill ins(%cst : f16) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) + linalg.matmul {lowering_config = #config} ins(%subview_1, %subview_2 : memref<64x2304xf16, strided<[2304, 1], offset: ?>>, memref<2304x128xf16, strided<[262144, 1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) + %subview_3 = memref.subview %0[%arg0] [64] [1] : memref<128xf16> to memref<64xf16, strided<[1], offset: ?>> + linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_3 : memref<64xf16, strided<[1], offset: ?>>) outs(%subview : memref<64x128xf16, strided<[262144, 1], offset: ?>>) { + ^bb0(%in: f16, %out: f16): + %8 = arith.addf %out, %in : f16 + linalg.yield %8 : f16 } } - return } + return } // PROMOTEC-LABEL: func.func @matmul_f16_128x262144x2304() diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir index c9a628973f9a9..36510aeb94a22 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_promote_matmul.mlir @@ -1,46 +1,52 @@ // RUN: iree-opt --split-input-file 
--mlir-print-local-scope --iree-gpu-test-target=pascal@vulkan --pass-pipeline='builtin.module(func.func(iree-spirv-tile-and-promote, cse))' %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 128)> #map1 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)> #map2 = affine_map<(d0, d1)[s0] -> (d0 * 128 + s0 + d1)> #map3 = affine_map<(d0, d1) -> (d0, d1)> #translation = #iree_codegen.translation_info -module { - func.func @matmul_f32_256x1024x128() attributes {translation_info = #translation} { - %c1024 = arith.constant 1024 : index - %c256 = arith.constant 256 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<256x128xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<128x1024xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xf32> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xf32> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %4 = affine.apply #map()[%workgroup_id_y] - %5 = affine.apply #map()[%workgroup_count_y] - scf.for %arg0 = %4 to %c256 step %5 { - %6 = affine.apply #map()[%workgroup_id_x] - %7 = affine.apply #map()[%workgroup_count_x] - scf.for %arg1 = %6 to %c1024 step %7 { - %subview = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, #map1> - %subview_0 = memref.subview %0[%arg0, 0] [128, 128] [1, 1] : memref<256x128xf32> to memref<128x128xf32, #map2> - %subview_1 = memref.subview %1[0, %arg1] [128, 128] [1, 1] : memref<128x1024xf32> to memref<128x128xf32, #map1> - %subview_2 = memref.subview %3[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, #map1> - linalg.fill ins(%cst : f32) outs(%subview_2 : memref<128x128xf32, #map1>) - linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<128x128xf32, #map2>, memref<128x128xf32, #map1>) outs(%subview_2 : memref<128x128xf32, #map1>) - linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2, %subview : memref<128x128xf32, #map1>, memref<128x128xf32, #map1>) outs(%subview_2 : memref<128x128xf32, #map1>) { - ^bb0(%in: f32, %in_3: f32, %out: f32): - %8 = arith.divf %in, %in_3 : f32 - linalg.yield %8 : f32 - } +func.func @matmul_f32_256x1024x128() attributes {translation_info = #translation} { + %c1024 = arith.constant 1024 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<256x128xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<128x1024xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : 
memref<256x1024xf32> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<256x1024xf32> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %4 = affine.apply #map()[%workgroup_id_y] + %5 = affine.apply #map()[%workgroup_count_y] + scf.for %arg0 = %4 to %c256 step %5 { + %6 = affine.apply #map()[%workgroup_id_x] + %7 = affine.apply #map()[%workgroup_count_x] + scf.for %arg1 = %6 to %c1024 step %7 { + %subview = memref.subview %2[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, #map1> + %subview_0 = memref.subview %0[%arg0, 0] [128, 128] [1, 1] : memref<256x128xf32> to memref<128x128xf32, #map2> + %subview_1 = memref.subview %1[0, %arg1] [128, 128] [1, 1] : memref<128x1024xf32> to memref<128x128xf32, #map1> + %subview_2 = memref.subview %3[%arg0, %arg1] [128, 128] [1, 1] : memref<256x1024xf32> to memref<128x128xf32, #map1> + linalg.fill ins(%cst : f32) outs(%subview_2 : memref<128x128xf32, #map1>) + linalg.matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<128x128xf32, #map2>, memref<128x128xf32, #map1>) outs(%subview_2 : memref<128x128xf32, #map1>) + linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%subview_2, %subview : memref<128x128xf32, #map1>, memref<128x128xf32, #map1>) outs(%subview_2 : memref<128x128xf32, #map1>) { + ^bb0(%in: f32, %in_3: f32, %out: f32): + %8 = arith.divf %in, %in_3 : f32 + linalg.yield %8 : f32 } } - return } + return } // CHECK-LABEL: func.func @matmul_f32_256x1024x128() @@ -52,10 +58,10 @@ module { // CHECK-DAG: %[[MEM_A:.+]] = memref.alloc() : memref<128x32xf32, #gpu.address_space<workgroup>> // CHECK-DAG: %[[MEM_B:.+]] = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>> -// CHECK-DAG: %[[BUFFER_A:.+]] = hal.interface.binding.subspan set(0) binding(0) {{.+}} : memref<256x128xf32> -// CHECK-DAG: %[[BUFFER_B:.+]] = hal.interface.binding.subspan set(0) binding(1) {{.+}} : memref<128x1024xf32> -// CHECK-DAG: %[[BUFFER_C:.+]] = hal.interface.binding.subspan set(0) binding(3) {{.+}} : memref<256x1024xf32> -// CHECK-DAG: %[[BUFFER_D:.+]] = hal.interface.binding.subspan set(0) binding(2) {{.+}} : memref<256x1024xf32> +// CHECK-DAG: %[[BUFFER_A:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) {{.+}} : memref<256x128xf32> +// CHECK-DAG: %[[BUFFER_B:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) {{.+}} : memref<128x1024xf32> +// CHECK-DAG: %[[BUFFER_C:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(3) {{.+}} : memref<256x1024xf32> +// CHECK-DAG: %[[BUFFER_D:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(2) {{.+}} : memref<256x1024xf32> // CHECK: scf.for // CHECK: scf.for @@ -105,50 +111,56 @@ module { // CHECK-SAME: outs(%[[VIEW_C]] // ----- + +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 64)> #map1 = affine_map<()[s0] -> (s0 * 256)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #translation = #iree_codegen.translation_info -module { - func.func @batch_matmul_16x1024x1024x80() attributes {translation_info = #translation} { - %c0 = arith.constant 0 : index - %c16 = 
arith.constant 16 : index - %c1024 = arith.constant 1024 : index - %cst = arith.constant 0.111803398 : f32 - %cst_0 = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<16x1024x80xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<16x80x1024xf16> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x1024x1024xf16> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z { - %3 = affine.apply #map()[%workgroup_id_y] - %4 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %3 to %c1024 step %4 { - %5 = affine.apply #map1()[%workgroup_id_x] - %6 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg2 = %5 to %c1024 step %6 { - %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 64, 256] [1, 1, 1] : memref<16x1024x1024xf16> to memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>> - %subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 80] [1, 1, 1] : memref<16x1024x80xf16> to memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>> - %subview_2 = memref.subview %1[%arg0, 0, %arg2] [1, 80, 256] [1, 1, 1] : memref<16x80x1024xf16> to memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>> - linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) - linalg.batch_matmul {lowering_config = #config} ins(%subview_1, %subview_2 : memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>>, memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>>) outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) - linalg.generic {indexing_maps = [#map2], iterator_types = ["parallel", "parallel", "parallel"]} outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) { - ^bb0(%out: f16): - %7 = arith.truncf %cst : f32 to f16 - %8 = arith.mulf %out, %7 : f16 - linalg.yield %8 : f16 - } +func.func @batch_matmul_16x1024x1024x80() attributes {translation_info = #translation} { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c1024 = arith.constant 1024 : index + %cst = arith.constant 0.111803398 : f32 + %cst_0 = arith.constant 0.000000e+00 : f16 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<16x1024x80xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<16x80x1024xf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<16x1024x1024xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z { + %3 = affine.apply #map()[%workgroup_id_y] 
+ %4 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %3 to %c1024 step %4 { + %5 = affine.apply #map1()[%workgroup_id_x] + %6 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg2 = %5 to %c1024 step %6 { + %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 64, 256] [1, 1, 1] : memref<16x1024x1024xf16> to memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>> + %subview_1 = memref.subview %0[%arg0, %arg1, 0] [1, 64, 80] [1, 1, 1] : memref<16x1024x80xf16> to memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>> + %subview_2 = memref.subview %1[%arg0, 0, %arg2] [1, 80, 256] [1, 1, 1] : memref<16x80x1024xf16> to memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>> + linalg.fill ins(%cst_0 : f16) outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) + linalg.batch_matmul {lowering_config = #config} ins(%subview_1, %subview_2 : memref<1x64x80xf16, strided<[81920, 80, 1], offset: ?>>, memref<1x80x256xf16, strided<[81920, 1024, 1], offset: ?>>) outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) + linalg.generic {indexing_maps = [#map2], iterator_types = ["parallel", "parallel", "parallel"]} outs(%subview : memref<1x64x256xf16, strided<[1048576, 1024, 1], offset: ?>>) { + ^bb0(%out: f16): + %7 = arith.truncf %cst : f32 to f16 + %8 = arith.mulf %out, %7 : f16 + linalg.yield %8 : f16 } } } - return } + return } // CHECK-LABEL: func.func @batch_matmul_16x1024x1024x80() @@ -166,43 +178,49 @@ module { // CHECK: gpu.barrier // ----- + +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #map = affine_map<()[s0] -> (s0 * 512)> #map1 = affine_map<()[s0] -> (s0 * 8)> #translation = #iree_codegen.translation_info -module { - func.func @batch_matmul_f32_16x4096x40x4096() attributes {translation_info = #translation} { - %c16 = arith.constant 16 : index - %c4096 = arith.constant 4096 : index - %c40 = arith.constant 40 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x4096xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x40xf32> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x4096x40xf32> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_count_x = hal.interface.workgroup.count[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_count_y = hal.interface.workgroup.count[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %workgroup_count_z = hal.interface.workgroup.count[2] : index - scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z { - %3 = affine.apply #map()[%workgroup_id_y] - %4 = affine.apply #map()[%workgroup_count_y] - scf.for %arg1 = %3 to %c4096 step %4 { - %5 = affine.apply #map1()[%workgroup_id_x] - %6 = affine.apply #map1()[%workgroup_count_x] - scf.for %arg2 = %5 to %c40 step %6 { - %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 512, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>> - %subview_0 = memref.subview %0[%arg0, %arg1, 0] [1, 512, 4096] [1, 1, 1] : memref<16x4096x4096xf32> to memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>> - 
%subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 4096, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>> - linalg.fill ins(%cst : f32) outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>) - linalg.batch_matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>>, memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>>) outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>) - } +func.func @batch_matmul_f32_16x4096x40x4096() attributes {translation_info = #translation} { + %c16 = arith.constant 16 : index + %c4096 = arith.constant 4096 : index + %c40 = arith.constant 40 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<16x4096x4096xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<16x4096x40xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<16x4096x40xf32> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_count_x = hal.interface.workgroup.count[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_count_y = hal.interface.workgroup.count[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %workgroup_count_z = hal.interface.workgroup.count[2] : index + scf.for %arg0 = %workgroup_id_z to %c16 step %workgroup_count_z { + %3 = affine.apply #map()[%workgroup_id_y] + %4 = affine.apply #map()[%workgroup_count_y] + scf.for %arg1 = %3 to %c4096 step %4 { + %5 = affine.apply #map1()[%workgroup_id_x] + %6 = affine.apply #map1()[%workgroup_count_x] + scf.for %arg2 = %5 to %c40 step %6 { + %subview = memref.subview %2[%arg0, %arg1, %arg2] [1, 512, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>> + %subview_0 = memref.subview %0[%arg0, %arg1, 0] [1, 512, 4096] [1, 1, 1] : memref<16x4096x4096xf32> to memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>> + %subview_1 = memref.subview %1[%arg0, 0, %arg2] [1, 4096, 8] [1, 1, 1] : memref<16x4096x40xf32> to memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>> + linalg.fill ins(%cst : f32) outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>) + linalg.batch_matmul {lowering_config = #config} ins(%subview_0, %subview_1 : memref<1x512x4096xf32, strided<[16777216, 4096, 1], offset: ?>>, memref<1x4096x8xf32, strided<[163840, 40, 1], offset: ?>>) outs(%subview : memref<1x512x8xf32, strided<[163840, 40, 1], offset: ?>>) } } - return } + return } // CHECK-LABEL: func.func @batch_matmul_f32_16x4096x40x4096() diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir index 9ffdc68063390..f04d6880b6e32 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_batch_matmul.mlir @@ -23,9 +23,9 @@ hal.executable private @fused_fill_batch_matmul { %cst = arith.constant 0.000000e+00 : f32 %c4 = arith.constant 4 : index %c1024 = arith.constant 1024 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 
= hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir index 435140774b543..e0faab5b6a1cf 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_conv.mlir @@ -23,9 +23,9 @@ hal.executable private @nhwc_conv_static_shape_f32 { %c16 = arith.constant 16 : index %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -97,9 +97,9 @@ hal.executable private @nhwc_nhwc_depthwise_conv_static_shape_f32 { %c96 = arith.constant 96 : index %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -176,10 +176,10 @@ hal.executable private @low_padded_conv { %c0 = arith.constant 0 : index %c112 = arith.constant 112 : index %c32 = arith.constant 32 : index - %0 = hal.interface.binding.subspan set(0) binding(0) 
type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(32) offset(%c0) : !flow.dispatch.tensor> %4 = tensor.empty() : tensor<1x112x112x32xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index @@ -271,7 +271,7 @@ hal.executable private @low_padded_conv { #hal.descriptor_set.binding<0, storage_buffer>, #hal.descriptor_set.binding<1, storage_buffer>, #hal.descriptor_set.binding<2, storage_buffer>, - #hal.descriptor_set.binding<2, storage_buffer> + #hal.descriptor_set.binding<3, storage_buffer> ]> ]> @@ -294,10 +294,10 @@ hal.executable private @low_high_padded_nhwc_depthwise_conv { %c0 = arith.constant 0 : index %c112 = arith.constant 112 : index %c32 = arith.constant 32 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(32) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(32) offset(%c0) : !flow.dispatch.tensor> %4 = tensor.empty() : tensor<1x112x112x32xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index @@ -407,9 +407,9 @@ hal.executable private @nchw_conv_static_shape_f32 { %c1280 = arith.constant 1280 : index %c8 = arith.constant 8 : index %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -483,10 +483,10 @@ hal.executable private @nhwc_conv_static_shape_f16_batch2 { %c320 = arith.constant 320 : index %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir index 7a60449bd5911..e194b3007d9bf 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_matmul.mlir @@ -21,9 +21,9 @@ hal.executable private @matmul_static_shape_f16 { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f16 %c4096 = arith.constant 4096 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index @@ -83,9 +83,9 @@ hal.executable private @matmul_static_shape_f32 { %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 %c4096 = arith.constant 4096 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor> - %2 = 
hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_count_x = hal.interface.workgroup.count[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_pooling.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_pooling.mlir index 2149860ce0e4d..9c43800ad7f8d 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_pooling.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_pooling.mlir @@ -28,8 +28,8 @@ hal.executable private @pooling_nhwc_sum_f32 { %c8 = arith.constant 8 : index %c0 = arith.constant 0 : index %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = tensor.empty() : tensor<12x12xf32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir index 77c5f3411c11f..a27be5d924b39 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir @@ -2,9 +2,17 @@ // RUN: --pass-pipeline='builtin.module(func.func(iree-spirv-tile-to-cooperative-ops, iree-codegen-generic-vectorization, iree-spirv-vectorize-to-cooperative-ops, iree-codegen-optimize-tensor-insert-extract-slices, canonicalize, cse))' \ // RUN: %s | FileCheck %s +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -builtin.module { func.func @matmul_256x1024x128_div_add() attributes {translation_info = #translation} { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index @@ -15,11 +23,11 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla %2 = gpu.thread_id z %alloc = memref.alloc() : memref<32x32xf16, 3> %alloc_0 = memref.alloc() : memref<32x32xf16, 3> - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xf16> - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1024x128xf16> - %5 = 
hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<256x128xf16> - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<256x128xf16> - %7 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : memref<256x128xf16> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<256x1024xf16> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<1024x128xf16> + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<256x128xf16> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<256x128xf16> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : memref<256x128xf16> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y] @@ -70,7 +78,6 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla } return } -} // CHECK: #[[$MAP_Y:.+]] = affine_map<()[s0] -> (s0 * 16)> // CHECK: #[[$MAP_X:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 16)> @@ -127,77 +134,83 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<2, storage_buffer>, + #hal.descriptor_set.binding<3, storage_buffer> + ]> +]> #config = #iree_codegen.lowering_config #translation = #iree_codegen.translation_info -builtin.module { - func.func @matmul_256x1024x128_div_add() attributes {translation_info = #translation} { - %cst = arith.constant 0.000000e+00 : f16 - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c512 = arith.constant 512 : index - %c1 = arith.constant 1 : index - %0 = gpu.thread_id x - %1 = gpu.thread_id y - %2 = gpu.thread_id z - %alloc = memref.alloc() : memref<1x32x32xf16, 3> - %alloc_0 = memref.alloc() : memref<1x32x32xf16, 3> - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<16x128x512xf16> - memref.assume_alignment %3, 64 : memref<16x128x512xf16> - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<16x512x256xf16> - memref.assume_alignment %4, 64 : memref<16x512x256xf16> - %5 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<16x128x256xf16> - memref.assume_alignment %5, 64 : memref<16x128x256xf16> - %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : memref<16x128x256xf16> - memref.assume_alignment %6, 64 : memref<16x128x256xf16> - %workgroup_id_x = hal.interface.workgroup.id[0] : index - %workgroup_id_y = hal.interface.workgroup.id[1] : index - %workgroup_id_z = hal.interface.workgroup.id[2] : index - %7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y] - %8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] - %subview = memref.subview %6[%workgroup_id_z, %7, %8] [1, 32, 32] [1, 1, 1] : memref<16x128x256xf16> to memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>> - %subview_1 = memref.subview 
%3[%workgroup_id_z, %7, 0] [1, 32, 512] [1, 1, 1] : memref<16x128x512xf16> to memref<1x32x512xf16, strided<[65536, 512, 1], offset: ?>> - %subview_2 = memref.subview %4[%workgroup_id_z, 0, %8] [1, 512, 32] [1, 1, 1] : memref<16x512x256xf16> to memref<1x512x32xf16, strided<[131072, 256, 1], offset: ?>> - linalg.fill {__internal_linalg_transform__ = "workgroup_memory"} - ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) - scf.for %arg0 = %c0 to %c512 step %c32 { - %subview_4 = memref.subview %subview_1[0, 0, %arg0] [1, 32, 32] [1, 1, 1] : memref<1x32x512xf16, strided<[65536, 512, 1], offset: ?>> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> - %subview_5 = memref.subview %subview_2[0, %arg0, 0] [1, 32, 32] [1, 1, 1] : memref<1x512x32xf16, strided<[131072, 256, 1], offset: ?>> to memref<1x32x32xf16, strided<[131072, 256, 1], offset: ?>> - gpu.barrier - %subview_6 = memref.subview %alloc[%c0, %c0, %c0] [1, 32, 32] [1, 1, 1] : memref<1x32x32xf16, 3> to memref<1x32x32xf16, strided<[1024, 32, 1], offset: ?>, 3> - %9 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] - %10 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] - %11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] - %12 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] - %subview_7 = memref.subview %subview_4[0, %9, %10] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> to memref<1x1x8xf16, strided<[65536, 512, 1], offset: ?>> - %subview_8 = memref.subview %subview_6[0, %11, %12] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[1024, 32, 1], offset: ?>, 3> to memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> - %13 = vector.transfer_read %subview_7[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8xf16, strided<[65536, 512, 1], offset: ?>>, vector<1x1x8xf16> - vector.transfer_write %13, %subview_8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> - %subview_9 = memref.subview %alloc_0[%c0, %c0, %c0] [1, 32, 32] [1, 1, 1] : memref<1x32x32xf16, 3> to memref<1x32x32xf16, strided<[1024, 32, 1], offset: ?>, 3> - %14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] - %15 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] - %16 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] - %17 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] - %subview_10 = memref.subview %subview_5[0, %14, %15] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[131072, 256, 1], offset: ?>> to memref<1x1x8xf16, strided<[131072, 256, 1], offset: ?>> - %subview_11 = memref.subview %subview_9[0, %16, %17] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[1024, 32, 1], offset: ?>, 3> to memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> - %18 = vector.transfer_read %subview_10[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8xf16, strided<[131072, 256, 1], offset: ?>>, vector<1x1x8xf16> - vector.transfer_write %18, %subview_11[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> - gpu.barrier - linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_memory", lowering_config = #config} - ins(%alloc, %alloc_0 : 
memref<1x32x32xf16, 3>, memref<1x32x32xf16, 3>) outs(%subview : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) - } - %subview_3 = memref.subview %5[%workgroup_id_z, %7, %8] [1, 32, 32] [1, 1, 1] : memref<16x128x256xf16> to memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>> - linalg.generic { - indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%subview_3 : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) - outs(%subview : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) - attrs = {__internal_linalg_transform__ = "workgroup_memory"} { - ^bb0(%in: f16, %out: f16): - %9 = arith.divf %out, %in : f16 - linalg.yield %9 : f16 - } - return +func.func @matmul_256x1024x128_div_add() attributes {translation_info = #translation} { + %cst = arith.constant 0.000000e+00 : f16 + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c512 = arith.constant 512 : index + %c1 = arith.constant 1 : index + %0 = gpu.thread_id x + %1 = gpu.thread_id y + %2 = gpu.thread_id z + %alloc = memref.alloc() : memref<1x32x32xf16, 3> + %alloc_0 = memref.alloc() : memref<1x32x32xf16, 3> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<16x128x512xf16> + memref.assume_alignment %3, 64 : memref<16x128x512xf16> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<16x512x256xf16> + memref.assume_alignment %4, 64 : memref<16x512x256xf16> + %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : memref<16x128x256xf16> + memref.assume_alignment %5, 64 : memref<16x128x256xf16> + %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : memref<16x128x256xf16> + memref.assume_alignment %6, 64 : memref<16x128x256xf16> + %workgroup_id_x = hal.interface.workgroup.id[0] : index + %workgroup_id_y = hal.interface.workgroup.id[1] : index + %workgroup_id_z = hal.interface.workgroup.id[2] : index + %7 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y] + %8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] + %subview = memref.subview %6[%workgroup_id_z, %7, %8] [1, 32, 32] [1, 1, 1] : memref<16x128x256xf16> to memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>> + %subview_1 = memref.subview %3[%workgroup_id_z, %7, 0] [1, 32, 512] [1, 1, 1] : memref<16x128x512xf16> to memref<1x32x512xf16, strided<[65536, 512, 1], offset: ?>> + %subview_2 = memref.subview %4[%workgroup_id_z, 0, %8] [1, 512, 32] [1, 1, 1] : memref<16x512x256xf16> to memref<1x512x32xf16, strided<[131072, 256, 1], offset: ?>> + linalg.fill {__internal_linalg_transform__ = "workgroup_memory"} + ins(%cst : f16) outs(%subview : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) + scf.for %arg0 = %c0 to %c512 step %c32 { + %subview_4 = memref.subview %subview_1[0, 0, %arg0] [1, 32, 32] [1, 1, 1] : memref<1x32x512xf16, strided<[65536, 512, 1], offset: ?>> to memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> + %subview_5 = memref.subview %subview_2[0, %arg0, 0] [1, 32, 32] [1, 1, 1] : memref<1x512x32xf16, strided<[131072, 256, 1], offset: ?>> to memref<1x32x32xf16, strided<[131072, 256, 1], offset: ?>> + gpu.barrier + %subview_6 = memref.subview %alloc[%c0, %c0, %c0] [1, 32, 32] [1, 1, 1] : memref<1x32x32xf16, 3> to memref<1x32x32xf16, strided<[1024, 32, 1], 
offset: ?>, 3> + %9 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] + %10 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] + %11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] + %12 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] + %subview_7 = memref.subview %subview_4[0, %9, %10] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[65536, 512, 1], offset: ?>> to memref<1x1x8xf16, strided<[65536, 512, 1], offset: ?>> + %subview_8 = memref.subview %subview_6[0, %11, %12] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[1024, 32, 1], offset: ?>, 3> to memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> + %13 = vector.transfer_read %subview_7[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8xf16, strided<[65536, 512, 1], offset: ?>>, vector<1x1x8xf16> + vector.transfer_write %13, %subview_8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> + %subview_9 = memref.subview %alloc_0[%c0, %c0, %c0] [1, 32, 32] [1, 1, 1] : memref<1x32x32xf16, 3> to memref<1x32x32xf16, strided<[1024, 32, 1], offset: ?>, 3> + %14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] + %15 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] + %16 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%0, %1, %2] + %17 = affine.apply affine_map<()[s0] -> (s0 * 8 - (s0 floordiv 4) * 32)>()[%0] + %subview_10 = memref.subview %subview_5[0, %14, %15] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[131072, 256, 1], offset: ?>> to memref<1x1x8xf16, strided<[131072, 256, 1], offset: ?>> + %subview_11 = memref.subview %subview_9[0, %16, %17] [1, 1, 8] [1, 1, 1] : memref<1x32x32xf16, strided<[1024, 32, 1], offset: ?>, 3> to memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> + %18 = vector.transfer_read %subview_10[%c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8xf16, strided<[131072, 256, 1], offset: ?>>, vector<1x1x8xf16> + vector.transfer_write %18, %subview_11[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x8xf16>, memref<1x1x8xf16, strided<[1024, 32, 1], offset: ?>, 3> + gpu.barrier + linalg.batch_matmul {__internal_linalg_transform__ = "workgroup_memory", lowering_config = #config} + ins(%alloc, %alloc_0 : memref<1x32x32xf16, 3>, memref<1x32x32xf16, 3>) outs(%subview : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) } + %subview_3 = memref.subview %5[%workgroup_id_z, %7, %8] [1, 32, 32] [1, 1, 1] : memref<16x128x256xf16> to memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>> + linalg.generic { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], + iterator_types = ["parallel", "parallel", "parallel"]} + ins(%subview_3 : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) + outs(%subview : memref<1x32x32xf16, strided<[32768, 256, 1], offset: ?>>) + attrs = {__internal_linalg_transform__ = "workgroup_memory"} { + ^bb0(%in: f16, %out: f16): + %9 = arith.divf %out, %in : f16 + linalg.yield %9 : f16 + } + return } // CHECK: #[[$MAP_Y:.+]] = affine_map<()[s0] -> (s0 * 16)> @@ -259,9 +272,15 @@ builtin.module { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer>, + #hal.descriptor_set.binding<4, storage_buffer> + ]> +]> #config = 
#iree_codegen.lowering_config #translation = #iree_codegen.translation_info -builtin.module { func.func @matmul_256x1024x128_mixed_signedness_int8() { %cst = arith.constant 0 : i32 %cst_i8 = arith.constant 0 : i8 @@ -273,9 +292,9 @@ func.func @matmul_256x1024x128_mixed_signedness_int8() { %2 = gpu.thread_id z %alloc = memref.alloc() : memref<32x32xi8, 3> %alloc_0 = memref.alloc() : memref<32x32xi8, 3> - %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<256x1024xi8> - %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<1024x128xi8> - %7 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) alignment(64) offset(%c0) : memref<256x128xi32> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<256x1024xi8> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<1024x128xi8> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(4) alignment(64) offset(%c0) : memref<256x128xi32> %workgroup_id_x = hal.interface.workgroup.id[0] : index %workgroup_id_y = hal.interface.workgroup.id[1] : index %8 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y] @@ -323,7 +342,6 @@ func.func @matmul_256x1024x128_mixed_signedness_int8() { } return } -} // CHECK: #[[$MAP_Y:.+]] = affine_map<()[s0] -> (s0 * 16)> // CHECK: #[[$MAP_X:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 16)> diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir index 020dccdfc455c..0a2fb48099997 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/vectorize_load_store.mlir @@ -49,16 +49,23 @@ func.func @dont_vectorize_scalar_load(%arg0: memref<4096x4096xf32>, %x: index, % // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @resource_copy() -// CHECK: %[[A:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x1024xvector<4xf32>> -// CHECK: %[[B:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x1024xvector<4xf32>> +// CHECK: %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<4096x1024xvector<4xf32>> +// CHECK: %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<4096x1024xvector<4xf32>> // CHECK: %[[V:.+]] = memref.load %[[A]][%{{.*}}, %{{.*}}] : memref<4096x1024xvector<4xf32>> // CHECK: memref.store %[[V]], %[[B]][%{{.*}}, %{{.*}}] : memref<4096x1024xvector<4xf32>> func.func @resource_copy() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x4096xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x4096xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4096x4096xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<4096x4096xf32> %v = vector.transfer_read %0[%c0, %c0], %cst : memref<4096x4096xf32>, vector<4xf32> vector.transfer_write %v, %1[%c0, %c0] : vector<4xf32>, memref<4096x4096xf32> return @@ -66,17 +73,24 @@ 
func.func @resource_copy() { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @resource_copy_with_offset() -// CHECK: %[[A:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%{{.*}}) : memref<2048x4096x1024xvector<4xf32>, strided<[4194304, 1024, 1], offset: ?>> -// CHECK: %[[B:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x1024xvector<4xf32>> +// CHECK: %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) offset(%{{.*}}) : memref<2048x4096x1024xvector<4xf32>, strided<[4194304, 1024, 1], offset: ?>> +// CHECK: %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<4096x1024xvector<4xf32>> // CHECK: %[[V:.+]] = memref.load %[[A]][%{{.*}}, %{{.*}}, %{{.*}}] : memref<2048x4096x1024xvector<4xf32>, strided<[4194304, 1024, 1], offset: ?>> // CHECK: memref.store %[[V]], %[[B]][%{{.*}}, %{{.*}}] : memref<4096x1024xvector<4xf32>> func.func @resource_copy_with_offset() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %offset = hal.interface.constant.load[0] : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%offset) : memref<2048x4096x4096xf32, strided<[16777216, 4096, 1], offset: ?>> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x4096xf32> + %offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) offset(%offset) : memref<2048x4096x4096xf32, strided<[16777216, 4096, 1], offset: ?>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<4096x4096xf32> %v = vector.transfer_read %0[%c0, %c0, %c0], %cst : memref<2048x4096x4096xf32, strided<[16777216, 4096, 1], offset: ?>>, vector<4xf32> vector.transfer_write %v, %1[%c0, %c0] : vector<4xf32>, memref<4096x4096xf32> return @@ -84,16 +98,23 @@ func.func @resource_copy_with_offset() { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @resource_copy_f16 -// CHECK: %[[A:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x1024xvector<4xf16>> -// CHECK: %[[B:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x1024xvector<4xf16>> +// CHECK: %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<4096x1024xvector<4xf16>> +// CHECK: %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<4096x1024xvector<4xf16>> // CHECK: %[[V:.+]] = memref.load %[[A]][%{{.*}}, %{{.*}}] : memref<4096x1024xvector<4xf16>> // CHECK: memref.store %[[V]], %[[B]][%{{.*}}, %{{.*}}] : memref<4096x1024xvector<4xf16>> func.func @resource_copy_f16() { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x4096xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x4096xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4096x4096xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<4096x4096xf16> %v = vector.transfer_read %0[%c0, %c0], %cst : memref<4096x4096xf16>, vector<4xf16> 
vector.transfer_write %v, %1[%c0, %c0] : vector<4xf16>, memref<4096x4096xf16> return @@ -101,16 +122,23 @@ func.func @resource_copy_f16() { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @resource_copy_8xf16 -// CHECK: %[[A:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x512xvector<4xf32>> -// CHECK: %[[B:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x512xvector<4xf32>> +// CHECK: %[[A:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<4096x512xvector<4xf32>> +// CHECK: %[[B:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<4096x512xvector<4xf32>> // CHECK: %[[V:.+]] = memref.load %[[A]][%{{.*}}, %{{.*}}] : memref<4096x512xvector<4xf32>> // CHECK: memref.store %[[V]], %[[B]][%{{.*}}, %{{.*}}] : memref<4096x512xvector<4xf32>> func.func @resource_copy_8xf16() { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x4096xf16> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x4096xf16> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4096x4096xf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<4096x4096xf16> %v = vector.transfer_read %0[%c0, %c0], %cst : memref<4096x4096xf16>, vector<8xf16> vector.transfer_write %v, %1[%c0, %c0] : vector<8xf16>, memref<4096x4096xf16> return @@ -118,19 +146,26 @@ func.func @resource_copy_8xf16() { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @resource_copy_dynamic_shape() func.func @resource_copy_dynamic_shape() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - // CHECK: %[[DIM0:.+]] = hal.interface.constant.load[0] : index - // CHECK: %[[DIM1:.+]] = hal.interface.constant.load[1] : index - %dim0 = hal.interface.constant.load[0] : index - %dim1 = hal.interface.constant.load[1] : index + // CHECK: %[[DIM0:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(0) : index + // CHECK: %[[DIM1:.+]] = hal.interface.constant.load layout({{.+}}) ordinal(1) : index + %dim0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + %dim1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index - // CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xvector<4xf32>>{%[[DIM0]], %[[DIM1]]} - // CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xvector<4xf32>>{%[[DIM0]], %[[DIM1]]} - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xf32>{%dim0, %dim1} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xf32>{%dim0, %dim1} + // CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<?x?xvector<4xf32>>{%[[DIM0]], %[[DIM1]]} + // CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<?x?xvector<4xf32>>{%[[DIM0]], %[[DIM1]]} + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<?x?xf32>{%dim0, %dim1} + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<?x?xf32>{%dim0, %dim1} // CHECK: %[[VAL:.+]] = memref.load %[[INPUT]] // CHECK: memref.store %[[VAL]], 
%[[OUTPUT]] @@ -142,15 +177,22 @@ func.func @resource_copy_dynamic_shape() { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @resource_copy_dynamic_last_dim() func.func @resource_copy_dynamic_last_dim() { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %dim = hal.interface.constant.load[0] : index - // CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x?xf32> - // CHECK: hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x?xf32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4096x?xf32>{%dim} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4096x?xf32>{%dim} + %dim = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index + // CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<4096x?xf32> + // CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(1) : memref<4096x?xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4096x?xf32>{%dim} + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<4096x?xf32>{%dim} %v = vector.transfer_read %0[%c0, %c0], %cst : memref<4096x?xf32>, vector<4xf32> vector.transfer_write %v, %1[%c0, %c0] : vector<4xf32>, memref<4096x?xf32> return @@ -158,16 +200,23 @@ func.func @resource_copy_dynamic_last_dim() { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @dont_vectorize_odd_vector_size func.func @dont_vectorize_odd_vector_size() { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index // CHECK: hal.interface.binding.subspan // CHECK-SAME: memref<4x3xf32> - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<4x3xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<4x3xf32> // CHECK: hal.interface.binding.subspan // CHECK-SAME: memref<4x3xf32> - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<4x3xf32> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<4x3xf32> %v = vector.transfer_read %0[%c0, %c0], %cst : memref<4x3xf32>, vector<3xf32> vector.transfer_write %v, %1[%c0, %c0] : vector<3xf32>, memref<4x3xf32> return @@ -175,13 +224,20 @@ func.func @dont_vectorize_odd_vector_size() { // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer>, + #hal.descriptor_set.binding<1, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @scalarize_vector_transfer_op func.func @scalarize_vector_transfer_op(%arg: vector<3xf32>) -> (vector<3xf32>) { %c0 = arith.constant 0: index %c3 = arith.constant 3: index %f0 = arith.constant 0.0 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<20xf32> - %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<20xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<20xf32> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<20xf32> // CHECK-DAG: %[[INDEX0:.+]] = arith.constant 3 : index // CHECK-DAG: %[[INDEX1:.+]] = arith.constant 4 : index // CHECK-DAG: %[[INDEX2:.+]] = arith.constant 5 : index @@ -233,11 +289,17 @@ func.func @scalarize_non_minor_identity_transfer_read(%memory: 
memref<4x2x4xi32> // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @scalarize_non_minor_identity_transfer_write // CHECK-SAME: (%[[VALUE:.+]]: vector<4xf32>, %[[I1:.+]]: index, %[[I2:.+]]: index) func.func @scalarize_non_minor_identity_transfer_write(%value: vector<4xf32>, %i1: index, %i2: index) { %c0 = arith.constant 0: index - %buffer = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : memref<1x130x130x64xf32> + %buffer = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : memref<1x130x130x64xf32> vector.transfer_write %value, %buffer[%c0, %i1, %i2, %c0] {in_bounds = [true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d2)>} : vector<4xf32>, memref<1x130x130x64xf32> return } @@ -284,10 +346,16 @@ func.func @scalarize_0d_transfer_write(%val: vector<f32>, %memory: memref<4xf32> // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @scalarize_indivisible_vector_transfer_read_op func.func @scalarize_indivisible_vector_transfer_read_op(%i: index) -> vector<4xf32> { %f0 = arith.constant 0.0 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<10xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<10xf32> %1 = vector.transfer_read %0[%i], %f0 : memref<10xf32>, vector<4xf32> return %1: vector<4xf32> } @@ -298,10 +366,16 @@ func.func @scalarize_indivisible_vector_transfer_read_op(%i: index) -> vector<4x // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @scalarize_indivisible_vector_transfer_write_op func.func @scalarize_indivisible_vector_transfer_write_op(%value: vector<4xf32>, %i: index) { %f0 = arith.constant 0.0 : f32 - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<10xf32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<10xf32> vector.transfer_write %value, %0[%i] : vector<4xf32>, memref<10xf32> return } @@ -360,11 +434,17 @@ func.func @vectorize_alloc_with_mma_load_store_unaligned_case(%i0: index, %i1: i // ----- +#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [ + #hal.descriptor_set.layout<0, bindings = [ + #hal.descriptor_set.binding<0, storage_buffer> + ]> +]> + // CHECK-LABEL: func.func @scalarize_vector_load_op // CHECK-SAME: (%[[ARG0:.+]]: index) func.func @scalarize_vector_load_op(%i: index) -> vector<4xi32> { %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<10x10xi32> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<10x10xi32> %1 = vector.load %0[%c0, %i] : memref<10x10xi32>, vector<4xi32> return %1: vector<4xi32> } @@ -389,22 +469,35 @@ func.func @scalarize_vector_load_op(%i: index) -> vector<4xi32> { // Test that the memref is not vectorized if the element type is a complex type. 
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: func.func @complex_memref
 func.func @complex_memref(%x: index, %y: index) -> complex<f32> {
-  // CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8x32xcomplex<f32>>
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<8x32xcomplex<f32>>
+  // CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<8x32xcomplex<f32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<8x32xcomplex<f32>>
   %1 = memref.load %0[%x, %y] : memref<8x32xcomplex<f32>>
   return %1: complex<f32>
 }

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: func.func @vectorize_mma_load_store_non_identity_memref
 // CHECK-SAME: (%[[I0:.+]]: index, %[[I1:.+]]: index)
 func.func @vectorize_mma_load_store_non_identity_memref(%i0: index, %i1: index) {
   %c0 = arith.constant 0 : index
-  %span0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x1280xf16, strided<[1280, 1], offset: 11840>, #hal.descriptor_type<storage_buffer>>
-  %span1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : memref<32x1280xf16, strided<[1280, 1], offset: 11840>, #hal.descriptor_type<storage_buffer>>
+  %span0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x1280xf16, strided<[1280, 1], offset: 11840>, #hal.descriptor_type<storage_buffer>>
+  %span1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : memref<32x1280xf16, strided<[1280, 1], offset: 11840>, #hal.descriptor_type<storage_buffer>>
   %val = gpu.subgroup_mma_load_matrix %span0[%i0, %i1] {leadDimension = 1280 : index} : memref<32x1280xf16, strided<[1280, 1], offset: 11840>, #hal.descriptor_type<storage_buffer>> -> !gpu.mma_matrix<16x16xf16, "COp">
   gpu.subgroup_mma_store_matrix %val, %span1[%i0, %i1] {leadDimension = 1280 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<32x1280xf16, strided<[1280, 1], offset: 11840>, #hal.descriptor_type<storage_buffer>>
   return
@@ -419,16 +512,22 @@ func.func @vectorize_mma_load_store_non_identity_memref(%i0: index, %i1: index)

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 func.func @transfer_read_i4_memref_vector8(%x: index) -> vector<8xi4> {
   %c0_i4 = arith.constant 0 : i4
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048xi4>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2048xi4>
   %1 = vector.transfer_read %0[%x], %c0_i4 {in_bounds = [true]} : memref<2048xi4>, vector<8xi4>
   return %1: vector<8xi4>
 }

 // CHECK-LABEL: func.func @transfer_read_i4_memref_vector8
 // CHECK-SAME: (%[[ARG:.+]]: index)
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<256xvector<1xi32>>
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<256xvector<1xi32>>
 // CHECK: %[[INDEX:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 8)>()[%[[ARG]]]
 // CHECK: %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]] : memref<256xvector<1xi32>>
 // CHECK: %[[CAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi32> to vector<8xi4>
@@ -438,14 +537,14 @@ func.func @transfer_read_i4_memref_vector8(%x: index) -> vector<8xi4> {

 // func.func @transfer_read_i4_memref_vector4(%x: index) -> vector<4xi4> {
 //   %c0_i4 = arith.constant 0 : i4
-//   %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048xi4>
+//   %0 = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<2048xi4>
 //   %1 = vector.transfer_read %0[%x], %c0_i4 {in_bounds = [true]} : memref<2048xi4>, vector<4xi4>
 //   return %1: vector<4xi4>
 // }

 // XXXXX-LABEL: func.func @transfer_read_i4_memref_vector4
 // XXXXX-SAME: (%[[ARG:.+]]: index)
-// XXXXX: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<512xvector<2xi8>>
+// XXXXX: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<512xvector<2xi8>>
 // XXXXX: %[[INDEX:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 4)>()[%[[ARG]]]
 // XXXXX: %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]] : memref<512xvector<2xi8>>
 // XXXXX: %[[CAST:.+]] = vector.bitcast %[[LOAD]] : vector<2xi8> to vector<4xi4>
@@ -453,16 +552,22 @@ func.func @transfer_read_i4_memref_vector8(%x: index) -> vector<8xi4> {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 func.func @transfer_read_i4_memref_vector2(%x: index) -> vector<2xi4> {
   %c0_i4 = arith.constant 0 : i4
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048xi4>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2048xi4>
   %1 = vector.transfer_read %0[%x], %c0_i4 {in_bounds = [true]} : memref<2048xi4>, vector<2xi4>
   return %1: vector<2xi4>
 }

 // XXXXX-LABEL: func.func @transfer_read_i4_memref_vector2
 // XXXXX-SAME: (%[[ARG:.+]]: index)
-// XXXXX: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024xvector<1xi8>>
+// XXXXX: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<1024xvector<1xi8>>
 // XXXXX: %[[INDEX:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 2)>()[%[[ARG]]]
 // XXXXX: %[[LOAD:.+]] = memref.load %[[SUBSPAN]][%[[INDEX]]] : memref<1024xvector<1xi8>>
 // XXXXX: %[[CAST:.+]] = vector.bitcast %[[LOAD]] : vector<1xi8> to vector<2xi4>
@@ -470,22 +575,34 @@ func.func @transfer_read_i4_memref_vector2(%x: index) -> vector<2xi4> {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 func.func @transfer_read_i3_memref_vector8(%x: index) -> vector<8xi3> {
   %c0_i3 = arith.constant 0 : i3
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048xi3>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2048xi3>
   %1 = vector.transfer_read %0[%x], %c0_i3 {in_bounds = [true]} : memref<2048xi3>, vector<8xi3>
   return %1: vector<8xi3>
 }

 // CHECK-LABEL: func.func @transfer_read_i3_memref_vector8
-// CHECK: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048xi3>
+// CHECK: hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<2048xi3>
 // CHECK-COUNT-8: memref.load {{.+}} : memref<2048xi3>

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 func.func @transfer_read_vector2_vector8(%x: index) -> (vector<2xi32>, vector<8xi32>) {
   %c0 = arith.constant 0 : i32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048xi32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2048xi32>
   %1 = vector.transfer_read %0[%x], %c0 {in_bounds = [true]} : memref<2048xi32>, vector<2xi32>
   %2 = vector.transfer_read %0[%x], %c0 {in_bounds = [true]} : memref<2048xi32>, vector<8xi32>
   return %1, %2: vector<2xi32>, vector<8xi32>
@@ -511,9 +628,15 @@ func.func @transfer_read_vector2_vector8(%x: index) -> (vector<2xi32>, vector<8x

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 func.func @transfer_write_vector2_vector8(%x: index, %val0: vector<2xi32>, %val1: vector<8xi32>) {
   %c0 = arith.constant 0 : i32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<2048xi32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<2048xi32>
   vector.transfer_write %val0, %0[%x] : vector<2xi32>, memref<2048xi32>
   vector.transfer_write %val1, %0[%x] : vector<8xi32>, memref<2048xi32>
   return
@@ -521,7 +644,7 @@ func.func @transfer_write_vector2_vector8(%x: index, %val0: vector<2xi32>, %val1

 // CHECK-LABEL: func @transfer_write_vector2_vector8
 // CHECK-SAME: (%[[INDEX:.+]]: index, %[[VAL0:.+]]: vector<2xi32>, %[[VAL1:.+]]: vector<8xi32>)
-// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<1024xvector<2xi32>>
+// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(0) binding(0) : memref<1024xvector<2xi32>>
 // CHECK: %[[OFFSET0:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 2)>()[%[[INDEX]]]
 // CHECK: memref.store %[[VAL0]], %[[SUBSPAN]][%[[OFFSET0]]]
@@ -540,12 +663,19 @@ func.func @transfer_write_vector2_vector8(%x: index, %val0: vector<2xi32>, %val1

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+
 func.func @scalarize_masked_vector_transfer_op(%arg: vector<3xf32>, %mask: vector<3xi1>) -> (vector<3xf32>) {
   %c0 = arith.constant 0: index
   %c3 = arith.constant 3: index
   %f0 = arith.constant 0.0 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<20xf32>
-  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<20xf32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<20xf32>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<20xf32>
   %3 = vector.transfer_read %0[%c3], %f0, %mask : memref<20xf32>, vector<3xf32>
   vector.transfer_write %arg, %2[%c3], %mask : vector<3xf32>, memref<20xf32>
   return %3: vector<3xf32>
@@ -592,11 +722,17 @@ func.func @scalarize_masked_vector_transfer_op(%arg: vector<3xf32>, %mask: vecto

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
+
 func.func @extract_vector_transfer_read_mask_bits(%arg: vector<3xf32>, %index: index) -> (vector<3xf32>) {
   %c3 = arith.constant 3: index
   %f0 = arith.constant 0.0 : f32
   %mask = vector.create_mask %index : vector<3xi1>
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<20xf32>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<20xf32>
   %1 = vector.transfer_read %0[%c3], %f0, %mask : memref<20xf32>, vector<3xf32>
   return %1: vector<3xf32>
 }
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir b/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir
index 6ca9c77729c9b..38cc29a25d0d6 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir
+++ b/compiler/src/iree/compiler/Codegen/VMVX/test/pipeline.mlir
@@ -1,10 +1,20 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(iree-vmvx-select-lowering-strategy, func.func(iree-vmvx-lower-executable-target))" --split-input-file %s | FileCheck %s

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}>
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
+
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
 #map3 = affine_map<()[s0] -> (16 ceildiv s0)>
+
+// CHECK: func @mmt4d_i8()
 func.func @mmt4d_i8() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
   %c0 = arith.constant 0 : index
   %c256 = arith.constant 256 : index
@@ -13,21 +23,20 @@ func.func @mmt4d_i8() attributes {hal.executable.target = #executable_target_vmv
   %0:2 = iree_codegen.query_tile_sizes tensor<16x16xi8, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
   %1 = affine.apply #map3()[%0#0]
   %2 = affine.apply #map3()[%0#1]
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%1, %2, %0#0, %0#1}
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%1, %2, %0#0, %0#1}
   %4:2 = iree_codegen.query_tile_sizes tensor<16x16xi8, #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
   %5 = affine.apply #map3()[%4#0]
   %6 = affine.apply #map3()[%4#1]
-  %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c256) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%5, %6, %4#0, %4#1}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c256) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%5, %6, %4#0, %4#1}
   %8:2 = iree_codegen.query_tile_sizes tensor<16x16xi32, #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2]>> -> index, index
   %9 = affine.apply #map3()[%8#0]
   %10 = affine.apply #map3()[%8#1]
-  %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c512) : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1}
+  %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c512) : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1}
   %12 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0], sizes = [%1, %2, %0#0, %0#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%1, %2, %0#0, %0#1} -> tensor<?x?x?x?xi8>
   %13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0], sizes = [%5, %6, %4#0, %4#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi8>>{%5, %6, %4#0, %4#1} -> tensor<?x?x?x?xi8>
   %14 = flow.dispatch.tensor.load %11, offsets = [0, 0, 0, 0], sizes = [%9, %10, %8#0, %8#1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1} -> tensor<?x?x?x?xi32>
+  // CHECK: iree_codegen.ukernel.generic "vmvx.mmt4d"
   %15 = linalg.mmt4d ins(%12, %13 : tensor<?x?x?x?xi8>, tensor<?x?x?x?xi8>) outs(%14 : tensor<?x?x?x?xi32>) -> tensor<?x?x?x?xi32>
   flow.dispatch.tensor.store %15, %11, offsets = [0, 0, 0, 0], sizes = [%9, %10, %8#0, %8#1], strides = [1, 1, 1, 1] : tensor<?x?x?x?xi32> -> !flow.dispatch.tensor<readwrite:tensor<?x?x?x?xi32>>{%9, %10, %8#0, %8#1}
   return
 }
-// CHECK: func @mmt4d_i8()
-// CHECK: iree_codegen.ukernel.generic "vmvx.mmt4d"
diff --git a/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir
index fc697fced2c71..5a2f6408de027 100644
--- a/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/VMVX/test/select_lowering_strategy.mlir
@@ -1,11 +1,18 @@
 // RUN: iree-opt -pass-pipeline='builtin.module(iree-vmvx-select-lowering-strategy)' -split-input-file %s | FileCheck %s

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
 func.func @matmul_static() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
-  %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readonly:tensor<384x512xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readonly:tensor<512x128xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) : !flow.dispatch.tensor<writeonly:tensor<384x128xf32>>
   %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [384, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<384x512xf32>> -> tensor<384x512xf32>
   %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [512, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<512x128xf32>> -> tensor<512x128xf32>
   %5 = tensor.empty() : tensor<384x128xf32>
@@ -25,16 +32,22 @@
 // -----

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+#pipeline_layout = #hal.pipeline.layout<push_constants = 6, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 func.func @copy_op_dynamic() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
-  %0 = hal.interface.constant.load[0] : index
-  %1 = hal.interface.constant.load[1] : index
-  %2 = hal.interface.constant.load[2] : index
-  %3 = hal.interface.constant.load[3] : index
-  %4 = hal.interface.constant.load[4] : index
-  %5 = hal.interface.constant.load[5] : index
-  %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<?x?xi32>{%0, %1}
-  %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<?x?xi32>{%2, %3}
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : index
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : index
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : index
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : memref<?x?xi32>{%0, %1}
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : memref<?x?xi32>{%2, %3}
   %subview = memref.subview %7[%4, %5] [%0, %1] [1, 1] : memref<?x?xi32> to memref<?x?xi32, strided<[?, 1], offset: ?>>
   linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : memref<?x?xi32>) outs(%subview : memref<?x?xi32, strided<[?, 1], offset: ?>>) {
   ^bb0(%in: i32, %out: i32):
@@ -53,13 +66,19 @@
 // -----

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @static_1d_fft_stage2() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index
   %cst = arith.constant dense<[1.000000e+00, 6.12323426E-17]> : tensor<2xf32>
   %cst_0 = arith.constant dense<[-0.000000e+00, -1.000000e+00]> : tensor<2xf32>
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
-  %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) : !flow.dispatch.tensor<readwrite:tensor<32xf32>>
   %2 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
   %3 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readwrite:tensor<32xf32>> -> tensor<32xf32>
   %4:2 = iree_linalg_ext.fft ins(%c2, %cst, %cst_0 : index, tensor<2xf32>, tensor<2xf32>) outs(%2, %3 : tensor<32xf32>, tensor<32xf32>) : tensor<32xf32>, tensor<32xf32>
@@ -78,6 +97,14 @@
 // -----

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>
+  ]>
+]>
 #map = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 func.func @fusion_quant_matmul_generic() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
@@ -88,13 +115,13 @@ func.func @fusion_quant_matmul_generic() attributes {hal.executable.target = #ex
   %c127_i32 = arith.constant 127 : i32
   %c107520 = arith.constant 107520 : index
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.constant.load[0] : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
   %1 = arith.index_castui %0 : i32 to index
-  %2 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3360x32xi8>>
-  %3 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c107520) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
-  %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x3360xi8>>{%1}
-  %6 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x32xi8>>{%1}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3360x32xi8>>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c107520) : !flow.dispatch.tensor<readonly:tensor<32xi32>>
+  %5 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x3360xi8>>{%1}
+  %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x32xi8>>{%1}
   %7 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%1, 3360], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x3360xi8>>{%1} -> tensor<?x3360xi8>
   %8 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [3360, 32], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<3360x32xi8>> -> tensor<3360x32xi8>
   %9 = flow.dispatch.tensor.load %3, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi32>> -> tensor<32xi32>
@@ -131,19 +158,25 @@
 // -----

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb">
+#pipeline_layout = #hal.pipeline.layout<push_constants = 4, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 func.func @unpack_outer_dynamic() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
   %c131072 = arith.constant 131072 : index
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.constant.load[0] : i32
-  %1 = hal.interface.constant.load[1] : i32
-  %2 = hal.interface.constant.load[2] : i32
-  %3 = hal.interface.constant.load[3] : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
   %4 = arith.index_castui %0 : i32 to index
   %5 = arith.index_castui %1 : i32 to index
   %6 = arith.index_castui %2 : i32 to index
   %7 = arith.index_castui %3 : i32 to index
-  %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
-  %9 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c131072) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
+  %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5}
+  %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c131072) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%6, %7}
   %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [%4, %5, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x32x16xi32>>{%4, %5} -> tensor<?x?x32x16xi32>
   %11 = tensor.empty(%6, %7) : tensor<?x?xi32>
   %unpack = tensor.unpack %10 inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %11 : tensor<?x?x32x16xi32> -> tensor<?x?xi32>
@@ -161,17 +194,23 @@
 // -----

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = true}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>
+  ]>
+]>
 #map = affine_map<()[s0] -> (1024 ceildiv s0)>
 #map1 = affine_map<()[s0] -> (2048 ceildiv s0)>
 #map2 = affine_map<(d0, d1) -> (d0, d1)>
 func.func @elem_pack_ukernels() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x2048xf32>>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1024x2048xf32>>
   %1:2 = iree_codegen.query_tile_sizes tensor<1024x2048xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], round_dims_to = array<i64: 16, 16, 16>>> -> index, index
   %2 = affine.apply #map()[%1#0]
   %3 = affine.apply #map1()[%1#1]
-  %4 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%2, %3, %1#0, %1#1}
+  %4 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%2, %3, %1#0, %1#1}
   %5 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 2048], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x2048xf32>> -> tensor<1024x2048xf32>
   %6 = tensor.empty() : tensor<1024x2048xf32>
   %7 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<1024x2048xf32>) outs(%6 : tensor<1024x2048xf32>) {
@@ -201,17 +240,22 @@
 // -----

 #executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<push_constants = 2, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>
+  ]>
+]>
 func.func @copy_cst() attributes {hal.executable.target = #executable_target_vmvx_bytecode_fb} {
   %cst = arith.constant dense<4.200000e-01> : tensor<5x19x8x4xf32>
   %c32_i64 = arith.constant 32 : i64
-  %0 = hal.interface.constant.load[0] : i32
-  %1 = hal.interface.constant.load[1] : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
   %2 = arith.extui %0 : i32 to i64
   %3 = arith.extui %1 : i32 to i64
   %4 = arith.shli %3, %c32_i64 : i64
   %5 = arith.ori %2, %4 : i64
   %6 = arith.index_castui %5 : i64 to index
-  %7 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%6) : !flow.dispatch.tensor<writeonly:tensor<5x19x8x4xf32>>
+  %7 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%6) : !flow.dispatch.tensor<writeonly:tensor<5x19x8x4xf32>>
   flow.dispatch.tensor.store %cst, %7, offsets = [0, 0, 0, 0], sizes = [5, 19, 8, 4], strides = [1, 1, 1, 1] : tensor<5x19x8x4xf32> -> !flow.dispatch.tensor<writeonly:tensor<5x19x8x4xf32>>
   return
 }
diff --git a/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir b/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
index bb7cb0e286754..0f2f3bd626f6e 100644
--- a/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
+++ b/compiler/src/iree/compiler/Codegen/WGSL/test/replace_push_constants.mlir
@@ -8,14 +8,20 @@ func.func @emptyFunctionNoOp() {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<3, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: @constantLoadIndex
 func.func @constantLoadIndex() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout({{.+}}) set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
   // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
   // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
   // CHECK: %[[CAST:.+]] = arith.index_cast %[[VECTOR_EXTRACT]] : i32 to index
-  %0 = hal.interface.constant.load[0] : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
   // CHECK: = arith.index_cast %[[CAST]] : index to i32
   %1 = arith.index_cast %0 : index to i32
   return
@@ -23,13 +29,19 @@ func.func @constantLoadIndex() {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<3, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: @constantLoadI32
 func.func @constantLoadI32() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
   // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
   // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
-  %0 = hal.interface.constant.load[0] : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
   // CHECK: = math.absi %[[VECTOR_EXTRACT]] : i32
   %1 = math.absi %0 : i32
   return
@@ -37,14 +49,20 @@ func.func @constantLoadI32() {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<3, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: @constantLoadI16
 func.func @constantLoadI16() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
   // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
   // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
   // CHECK: %[[TRUNC:.+]] = arith.trunci %[[VECTOR_EXTRACT]] : i32 to i16
-  %0 = hal.interface.constant.load[0] : i16
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i16
   // CHECK: = math.absi %[[TRUNC]] : i16
   %1 = math.absi %0 : i16
   return
@@ -52,14 +70,20 @@ func.func @constantLoadI16() {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 1, sets = [
+  #hal.descriptor_set.layout<3, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: @constantLoadF32
 func.func @constantLoadF32() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>>
   // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1xvector<4xi32>>> -> tensor<1xvector<4xi32>>
   // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<1xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c0{{.*}}] : vector<4xi32>
   // CHECK: %[[CAST:.+]] = arith.bitcast %[[VECTOR_EXTRACT]] : i32 to f32
-  %0 = hal.interface.constant.load[0] : f32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : f32
   // CHECK: = math.absf %[[CAST]] : f32
   %1 = math.absf %0 : f32
   return
@@ -67,14 +91,20 @@ func.func @constantLoadF32() {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 6, sets = [
+  #hal.descriptor_set.layout<3, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: @constantLoadWithIndexAndAlignment
 func.func @constantLoadWithIndexAndAlignment() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) alignment(16) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2xvector<4xi32>>>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(0) type(uniform_buffer) alignment(16) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<2xvector<4xi32>>>
   // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [2], strides = [1] : !flow.dispatch.tensor<readonly:tensor<2xvector<4xi32>>> -> tensor<2xvector<4xi32>>
   // CHECK: %[[TENSOR_EXTRACT:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<2xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT:.+]] = vector.extractelement %[[TENSOR_EXTRACT]][%c1{{.*}}] : vector<4xi32>
   // CHECK: %[[CAST:.+]] = arith.index_cast %[[VECTOR_EXTRACT]] : i32 to index
-  %0 = hal.interface.constant.load[5] alignment(16) : index
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) alignment(16) : index
   // CHECK: = arith.index_cast %[[CAST]] : index to i32
   %1 = arith.index_cast %0 : index to i32
   return
@@ -82,9 +112,15 @@ func.func @constantLoadWithIndexAndAlignment() {

 // -----

+#pipeline_layout = #hal.pipeline.layout<push_constants = 9, sets = [
+  #hal.descriptor_set.layout<3, bindings = [
+    #hal.descriptor_set.binding<0, uniform_buffer>
+  ]>
+]>
+
 // CHECK-LABEL: @constantLoadMultiple
 func.func @constantLoadMultiple() {
-  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3xvector<4xi32>>>
+  // CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan layout(#pipeline_layout) set(3) binding(0) type(uniform_buffer) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<3xvector<4xi32>>>
   // CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]], offsets = [0], sizes = [3], strides = [1] : !flow.dispatch.tensor<readonly:tensor<3xvector<4xi32>>> -> tensor<3xvector<4xi32>>

   // Extracting 9 i32s from tensor<3xvector<4xi32>>:
@@ -96,31 +132,31 @@ func.func @constantLoadMultiple() {

   // CHECK: %[[TENSOR_EXTRACT_0:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_0:.+]] = vector.extractelement %[[TENSOR_EXTRACT_0]][%c0{{.*}}] : vector<4xi32>
-  %0 = hal.interface.constant.load[0] : i32
+  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
   // CHECK: %[[TENSOR_EXTRACT_1:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_1:.+]] = vector.extractelement %[[TENSOR_EXTRACT_1]][%c1{{.*}}] : vector<4xi32>
-  %1 = hal.interface.constant.load[1] : i32
+  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
   // CHECK: %[[TENSOR_EXTRACT_2:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_2:.+]] = vector.extractelement %[[TENSOR_EXTRACT_2]][%c2{{.*}}] : vector<4xi32>
-  %2 = hal.interface.constant.load[2] : i32
+  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
   // CHECK: %[[TENSOR_EXTRACT_3:.+]] = tensor.extract %[[LOAD]][%c0{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_3:.+]] = vector.extractelement %[[TENSOR_EXTRACT_3]][%c3{{.*}}] : vector<4xi32>
-  %3 = hal.interface.constant.load[3] : i32
+  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
   // CHECK: %[[TENSOR_EXTRACT_4:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_4:.+]] = vector.extractelement %[[TENSOR_EXTRACT_4]][%c0{{.*}}] : vector<4xi32>
-  %4 = hal.interface.constant.load[4] : i32
+  %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
   // CHECK: %[[TENSOR_EXTRACT_5:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_5:.+]] = vector.extractelement %[[TENSOR_EXTRACT_5]][%c1{{.*}}] : vector<4xi32>
-  %5 = hal.interface.constant.load[5] : i32
+  %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
   // CHECK: %[[TENSOR_EXTRACT_6:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_6:.+]] = vector.extractelement %[[TENSOR_EXTRACT_6]][%c2{{.*}}] : vector<4xi32>
-  %6 = hal.interface.constant.load[6] : i32
+  %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
   // CHECK: %[[TENSOR_EXTRACT_7:.+]] = tensor.extract %[[LOAD]][%c1{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_7:.+]] = vector.extractelement %[[TENSOR_EXTRACT_7]][%c3{{.*}}] : vector<4xi32>
-  %7 = hal.interface.constant.load[7] : i32
+  %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
   // CHECK: %[[TENSOR_EXTRACT_8:.+]] = tensor.extract %[[LOAD]][%c2{{.*}}] : tensor<3xvector<4xi32>>
   // CHECK: %[[VECTOR_EXTRACT_8:.+]] = vector.extractelement %[[TENSOR_EXTRACT_8]][%c0{{.*}}] : vector<4xi32>
-  %8 = hal.interface.constant.load[8] : i32
+  %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
   // CHECK: = math.absi %[[VECTOR_EXTRACT_0]] : i32
   %abs_0 = math.absi %0 : i32