diff --git a/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir b/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir index 5d5b591a81fc..711837f97eeb 100644 --- a/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir +++ b/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir @@ -8,20 +8,8 @@ #device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {hal.device.targets = [#device_target_llvm_cpu]} { util.func public @lhs_encoding(%arg0: tensor) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %dim = tensor.dim %arg0, %c0 : tensor - %dim_0 = tensor.dim %arg0, %c1 : tensor - %0:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index - %1 = affine.apply #map()[%0#0, %dim] - %2 = affine.apply #map()[%0#1, %dim_0] - %padded = tensor.pad %arg0 low[0, 0] high[%1, %2] { - ^bb0(%arg1: index, %arg2: index): - tensor.yield %cst : f32 - } : tensor to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor> - %4 = iree_encoding.unset_encoding %3 : tensor> -> tensor + %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> + %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor util.return %4 : tensor } } diff --git a/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir b/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir index 037cda061e9c..aa728269a5a5 100644 --- a/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir +++ b/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir @@ -1,27 +1,14 @@ // RUN: iree-opt --split-input-file --iree-hal-device-assignment-pipeline --iree-global-opt-materialize-homogeneous-encodings %s | FileCheck %s #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb"> -#map = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> #map3 = affine_map<(d0, d1, d2) -> (d0, d1)> #device_target_vulkan = #hal.device.target<"vulkan", [#executable_target_vulkan_spirv_fb]> : !hal.device module attributes {hal.device.targets = [#device_target_vulkan]} { util.func public @lhs_encoding(%arg0: tensor) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %dim = tensor.dim %arg0, %c0 : tensor - %dim_0 = tensor.dim %arg0, %c1 : tensor - %0:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index - %1 = affine.apply #map()[%0#0, %dim] - %2 = affine.apply #map()[%0#1, %dim_0] - %padded = tensor.pad %arg0 low[0, 0] high[%1, %2] { - ^bb0(%arg1: index, %arg2: index): - tensor.yield %cst : f32 - } : tensor to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor> - %4 = iree_encoding.unset_encoding %3 : tensor> -> tensor + %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> + %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor util.return %4 : tensor } } @@ -33,7 +20,6 @@ module attributes {hal.device.targets = [#device_target_vulkan]} { // ----- -#map = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> #map3 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -43,20 +29,8 @@ module attributes {hal.device.targets = 
[#device_target_vulkan]} { #device_target_vulkan = #hal.device.target<"vulkan", [#executable_target_vulkan_spirv_fb]> : !hal.device module attributes {hal.device.targets = [#hal.device.select<[#device_target_vulkan, #device_target_llvm_cpu]> : !hal.device]} { util.func public @lhs_encoding(%arg0: tensor) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %dim = tensor.dim %arg0, %c0 : tensor - %dim_0 = tensor.dim %arg0, %c1 : tensor - %0:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index - %1 = affine.apply #map()[%0#0, %dim] - %2 = affine.apply #map()[%0#1, %dim_0] - %padded = tensor.pad %arg0 low[0, 0] high[%1, %2] { - ^bb0(%arg1: index, %arg2: index): - tensor.yield %cst : f32 - } : tensor to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor> - %4 = iree_encoding.unset_encoding %3 : tensor> -> tensor + %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> + %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor util.return %4 : tensor } } diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir index 2a96d39970c8..65f800fa31a5 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir @@ -37,31 +37,12 @@ func.func @set_encoding_with_padding_semantics_bf16_x86_64_avx512f() attributes func.func @set_encoding_7x7x7_matmul_LHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { - %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = arith.index_castui %0 : i32 to index - %5 = arith.index_castui %1 : i32 to index - %6 = arith.index_castui %2 : i32 to index - %7 = arith.index_castui %3 : i32 to index %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %9 = flow.dispatch.workload.ordinal %6, 2 : index - %10 = flow.dispatch.workload.ordinal %7, 3 : index - %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%9, %10} - %12 = flow.dispatch.workload.ordinal %4, 0 : index - %13 = flow.dispatch.workload.ordinal %5, 1 : index + %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [7, 7], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<7x7xf32> - %15 = affine.apply affine_map<()[s0] -> ((7 ceildiv s0) * s0 - 7)>()[%12] - %16 = affine.apply affine_map<()[s0] -> ((7 ceildiv s0) * s0 - 7)>()[%13] - %padded = tensor.pad %14 low[0, 0] high[%15, %16] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst : f32 - } : tensor<7x7xf32> to tensor - %17 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - flow.dispatch.tensor.store %17, %11, offsets = [0, 0], sizes = [%9, %10], strides = [1, 1] : 
tensor, user_indexing_maps = [#map, #map1, #map2]>> -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%9, %10} + %17 = iree_encoding.set_encoding %14 : tensor<7x7xf32> -> tensor<7x7xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + flow.dispatch.tensor.store %17, %11, offsets = [0, 0], sizes = [7, 7], strides = [1, 1] : tensor<7x7xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> return } // CHECK-LABEL: func @set_encoding_7x7x7_matmul_LHS( @@ -81,33 +62,14 @@ func.func @set_encoding_7x7x7_matmul_LHS() attributes { func.func @set_encoding_128x80x32_batch_matmul_LHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { - %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = arith.index_castui %0 : i32 to index - %5 = arith.index_castui %1 : i32 to index - %6 = arith.index_castui %2 : i32 to index - %7 = arith.index_castui %3 : i32 to index %8 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %9 = flow.dispatch.workload.ordinal %6, 2 : index - %10 = flow.dispatch.workload.ordinal %7, 3 : index - %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%9, %10} - %12 = flow.dispatch.workload.ordinal %4, 0 : index - %13 = flow.dispatch.workload.ordinal %5, 1 : index + %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32xf32> - %15 = affine.apply affine_map<()[s0] -> ((32 ceildiv s0) * s0 - 32)>()[%12] - %16 = affine.apply affine_map<()[s0] -> ((80 ceildiv s0) * s0 - 80)>()[%13] - %padded = tensor.pad %14 low[0, 0, 0] high[0, %16, %15] { - ^bb0(%arg0: index, %arg1: index, %arg2: index): - tensor.yield %cst : f32 - } : tensor<128x80x32xf32> to tensor<128x?x?xf32> - %17 = iree_encoding.set_encoding %padded : tensor<128x?x?xf32> -> tensor<128x?x?xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2]>> - flow.dispatch.tensor.store %17, %11, offsets = [0, 0, 0], sizes = [128, %9, %10], strides = [1, 1, 1] - : tensor<128x?x?xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2]>> - -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%9, %10} + %17 = iree_encoding.set_encoding %14 : tensor<128x80x32xf32> -> tensor<128x80x32xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + flow.dispatch.tensor.store %17, %11, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] + : tensor<128x80x32xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> return } // CHECK-LABEL: func 
@set_encoding_128x80x32_batch_matmul_LHS( @@ -126,35 +88,16 @@ func.func @set_encoding_128x80x32_batch_matmul_LHS() attributes { func.func @set_encoding_128x32x320_batch_matmul_RHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { - %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 - %3 = hal.interface.constant.load[3] : i32 - %4 = hal.interface.constant.load[4] : i32 %5 = arith.index_castui %0 {stream.alignment = 64 : index} : i32 to index - %6 = arith.index_castui %1 : i32 to index - %7 = arith.index_castui %2 : i32 to index - %8 = arith.index_castui %3 : i32 to index - %9 = arith.index_castui %4 : i32 to index %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %11 = flow.dispatch.workload.ordinal %8, 2 : index - %12 = flow.dispatch.workload.ordinal %9, 3 : index - %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%11, %12} - %14 = flow.dispatch.workload.ordinal %6, 0 : index - %15 = flow.dispatch.workload.ordinal %7, 1 : index + %13 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%5) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x320xf32> - %17 = affine.apply affine_map<()[s0] -> ((320 ceildiv s0) * s0 - 320)>()[%14] - %18 = affine.apply affine_map<()[s0] -> ((32 ceildiv s0) * s0 - 32)>()[%15] - %padded = tensor.pad %16 low[0, 0, 0] high[0, %18, %17] { - ^bb0(%arg0: index, %arg1: index, %arg2: index): - tensor.yield %cst : f32 - } : tensor<128x32x320xf32> to tensor<128x?x?xf32> - %19 = iree_encoding.set_encoding %padded : tensor<128x?x?xf32> -> tensor<128x?x?xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2]>> - flow.dispatch.tensor.store %19, %13, offsets = [0, 0, 0], sizes = [128, %11, %12], strides = [1, 1, 1] - : tensor<128x?x?xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2]>> - -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%11, %12} + %19 = iree_encoding.set_encoding %16 : tensor<128x32x320xf32> -> tensor<128x32x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + flow.dispatch.tensor.store %19, %13, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] + : tensor<128x32x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> return } // CHECK-LABEL: func @set_encoding_128x32x320_batch_matmul_RHS( @@ -175,21 +118,14 @@ func.func @unset_encoding_128x80x320_batch_matmul_RESULT() attributes { } { %c0 = arith.constant 0 : index %0 = hal.interface.constant.load[0] : i32 - %1 = hal.interface.constant.load[1] : i32 - %2 = hal.interface.constant.load[2] : i32 %3 = arith.index_castui %0 : i32 to index - %4 = arith.index_castui %1 : i32 to index - %5 = arith.index_castui %2 : i32 to index %6 = hal.interface.binding.subspan set(0) binding(1) 
type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %7 = flow.dispatch.workload.ordinal %4, 0 : index - %8 = flow.dispatch.workload.ordinal %5, 1 : index - %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%7, %8} - %10 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [128, %7, %8], strides = [1, 1, 1] - : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2]>>>{%7, %8} - -> tensor<128x?x?xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2]>> - %11 = iree_encoding.unset_encoding %10 : tensor<128x?x?xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2]>> -> tensor<128x?x?xf32> - %extracted_slice = tensor.extract_slice %11[0, 0, 0] [128, 80, 320] [1, 1, 1] : tensor<128x?x?xf32> to tensor<128x80x320xf32> - flow.dispatch.tensor.store %extracted_slice, %6, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor> + %9 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %10 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] + : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + -> tensor<128x80x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %11 = iree_encoding.unset_encoding %10 : tensor<128x80x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<128x80x320xf32> + flow.dispatch.tensor.store %11, %6, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor> return } // CHECK-LABEL: func @unset_encoding_128x80x320_batch_matmul_RESULT() @@ -220,14 +156,14 @@ func.func @pack_gemm_fill_dynamic(%arg0 : tensor, %arg1 : tensor %d1 = tensor.dim %arg1, %c1 : tensor - %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor> - %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor> - %2 = tensor.empty(%d0, %d1) : tensor> - %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor>) - -> tensor> - %4 = linalg.matmul ins(%0, %1 : tensor>, tensor>) - outs(%3 : tensor>) -> tensor> - %5 = iree_encoding.unset_encoding %4 : tensor> -> tensor + %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> + %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor>> + %2 = tensor.empty(%d0, %d1) : tensor>> + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor>>) + -> tensor>> + %4 = linalg.matmul ins(%0, %1 : tensor>>, tensor>>) + outs(%3 : tensor>>) -> tensor>> + %5 = iree_encoding.unset_encoding %4 : tensor>> -> tensor return %5 : tensor } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -261,29 +197,15 @@ func.func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_vi hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32> %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16x1xf32> %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16x1xf32> - %padded = tensor.pad %0 low[0, 0] 
high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x16xf32> to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_0 = tensor.pad %1 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x1xf32> to tensor - %4 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_1 = tensor.pad %2 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x1xf32> to tensor - %5 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %6 = linalg.matmul ins(%3, %4 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%5 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %7 = iree_encoding.unset_encoding %6 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor - %extracted_slice = tensor.extract_slice %7[0, 0] [16, 1] [1, 1] : tensor to tensor<16x1xf32> - %8 = hal.tensor.export %extracted_slice "output0" : tensor<16x1xf32> -> !hal.buffer_view + %3 = iree_encoding.set_encoding %0 : tensor<16x16xf32> -> tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %4 = iree_encoding.set_encoding %1 : tensor<16x1xf32> -> tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %5 = iree_encoding.set_encoding %2 : tensor<16x1xf32> -> tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %6 = linalg.matmul ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>, tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, 
affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) outs(%5 : tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) -> tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %7 = iree_encoding.unset_encoding %6 : tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> -> tensor<16x1xf32> + %8 = hal.tensor.export %7 "output0" : tensor<16x1xf32> -> !hal.buffer_view func.return %8 : !hal.buffer_view } // CHECK-LABEL: func @matvec_shaped_matmul_lowering_f32f32f32_aarch64( @@ -304,28 +226,28 @@ func.func @matmul_lowering_f32f32f32_aarch64() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -363,29 +285,15 @@ func.func @matvec_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !ha hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32> %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16xf32> %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16xf32> - %padded = tensor.pad %0 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x16xf32> to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, 
d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %padded_0 = tensor.pad %1 low[0] high[%c0] { - ^bb0(%arg3: index): - tensor.yield %cst : f32 - } : tensor<16xf32> to tensor - %4 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %padded_1 = tensor.pad %2 low[0] high[%c0] { - ^bb0(%arg3: index): - tensor.yield %cst : f32 - } : tensor<16xf32> to tensor - %5 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %6 = linalg.matvec ins(%3, %4 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>, tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>) outs(%5 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>) -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %7 = iree_encoding.unset_encoding %6 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> -> tensor - %extracted_slice = tensor.extract_slice %7[0] [16] [1] : tensor to tensor<16xf32> - %8 = hal.tensor.export %extracted_slice "output0" : tensor<16xf32> -> !hal.buffer_view + %3 = iree_encoding.set_encoding %0 : tensor<16x16xf32> -> tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> + %4 = iree_encoding.set_encoding %1 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> + %5 = iree_encoding.set_encoding %2 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> + %6 = linalg.matvec ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>, tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>) outs(%5 : tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>) -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> + %7 = iree_encoding.unset_encoding %6 : tensor<16xf32, #iree_encoding.encoding, 
matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> -> tensor<16xf32> + %8 = hal.tensor.export %7 "output0" : tensor<16xf32> -> !hal.buffer_view func.return %8 : !hal.buffer_view } // CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64( @@ -403,28 +311,28 @@ func.func @matvec_lowering_f32f32f32_aarch64() attributes { } { %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>> + : !flow.dispatch.tensor>>> %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>> + : !flow.dispatch.tensor>>> %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>> + : !flow.dispatch.tensor>>> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16, 16], strides = [1, 1] - : !flow.dispatch.tensor>> - -> tensor<16x16xf32, #iree_encoding.encoding> + : !flow.dispatch.tensor>>> + -> tensor<16x16xf32, #iree_encoding.encoding>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 1], strides = [1, 1] - : !flow.dispatch.tensor>> - -> tensor<16x1xf32, #iree_encoding.encoding> + : !flow.dispatch.tensor>>> + -> tensor<16x1xf32, #iree_encoding.encoding>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [16, 1], strides = [1, 1] - : !flow.dispatch.tensor>> - -> tensor<16x1xf32, #iree_encoding.encoding> + : !flow.dispatch.tensor>>> + -> tensor<16x1xf32, #iree_encoding.encoding>> %6 = linalg.matmul - ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding>, - tensor<16x1xf32, #iree_encoding.encoding>) - outs(%5 : tensor<16x1xf32, #iree_encoding.encoding>) - -> tensor<16x1xf32, #iree_encoding.encoding> + ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding>>, + tensor<16x1xf32, #iree_encoding.encoding>>) + outs(%5 : tensor<16x1xf32, #iree_encoding.encoding>>) + -> tensor<16x1xf32, #iree_encoding.encoding>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [16, 1], strides = [1, 1] - : tensor<16x1xf32, #iree_encoding.encoding> - -> !flow.dispatch.tensor>> + : tensor<16x1xf32, #iree_encoding.encoding>> + -> !flow.dispatch.tensor>>> return } // CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64() @@ -460,28 +368,28 @@ func.func @matmul_lowering_f16f16f16_aarch64() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 
1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -523,28 +431,28 @@ func.func @matmul_lowering_f32f32f32_x86_64() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -587,28 +495,28 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx2() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + 
ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -650,28 +558,28 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx512f() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -713,28 +621,28 @@ func.func @matmul_lowering_f16f16f32_x86_64_avx512f() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> 
!flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -776,28 +684,28 @@ func.func @matmul_lowering_f16f16f16_x86_64_avx512f() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -839,28 +747,28 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512f() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -902,28 +810,28 
@@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -965,28 +873,28 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1030,28 +938,28 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = 
hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1095,37 +1003,37 @@ func.func @matmul_lowering_f32f16f16_aarch64() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %lhs_f32 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %rhs = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %dest = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> - %empty = tensor.empty(%M, %K) : tensor> + %empty = tensor.empty(%M, %K) : tensor>> %lhs_f16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} - ins(%lhs_f32 : tensor>) - outs(%empty : tensor>) { + ins(%lhs_f32 : tensor>>) + outs(%empty : tensor>>) { ^bb0(%in: f32, %out: f16): %17 = arith.truncf %in : f32 to f16 linalg.yield %17 : f16 - } -> tensor> + } -> tensor>> %6 = linalg.matmul - ins(%lhs_f16, %rhs : tensor>, - tensor>) - outs(%dest : tensor>) - -> tensor> + ins(%lhs_f16, %rhs : tensor>>, + tensor>>) + outs(%dest : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, 
%N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP_CEILDIV_8:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1160,37 +1068,37 @@ func.func @matmul_lowering_f32f16f16_x86_64_avx512f() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %lhs_f32 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %rhs = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %dest = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> - %empty = tensor.empty(%M, %K) : tensor> + %empty = tensor.empty(%M, %K) : tensor>> %lhs_f16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} - ins(%lhs_f32 : tensor>) - outs(%empty : tensor>) { + ins(%lhs_f32 : tensor>>) + outs(%empty : tensor>>) { ^bb0(%in: f32, %out: f16): %17 = arith.truncf %in : f32 to f16 linalg.yield %17 : f16 - } -> tensor> + } -> tensor>> %6 = linalg.matmul - ins(%lhs_f16, %rhs : tensor>, - tensor>) - outs(%dest : tensor>) - -> tensor> + ins(%lhs_f16, %rhs : tensor>>, + tensor>>) + outs(%dest : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } @@ -1226,28 +1134,28 @@ func.func @matmul_lowering_i8i8i32_aarch64() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, 
- tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-LABEL: func @matmul_lowering_i8i8i32_aarch64() @@ -1286,28 +1194,28 @@ func.func @matmul_lowering_i8i8i32_aarch64_dotprod() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1351,28 +1259,28 @@ func.func @matmul_lowering_i8i8i32_aarch64_i8mm() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides 
= [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1415,28 +1323,28 @@ func.func @matmul_lowering_i8i4i32_aarch64() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> @@ -1481,28 +1389,28 @@ func.func @matmul_lowering_i8i4i32_aarch64_dotprod() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ 
-1545,28 +1453,28 @@ func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> @@ -1606,15 +1514,15 @@ func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { func.func @matmul_lowering_f32f32f32_aarch64_sve(%lhs : tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {cpu_features = "+sve", target_triple="aarch64-xyz-xyz"}> } { - %0 = iree_encoding.set_encoding %lhs : tensor -> tensor> - %1 = iree_encoding.set_encoding %rhs : tensor -> tensor> - %2 = iree_encoding.set_encoding %acc : tensor -> tensor> + %0 = iree_encoding.set_encoding %lhs : tensor -> tensor>> + %1 = iree_encoding.set_encoding %rhs : tensor -> tensor>> + %2 = iree_encoding.set_encoding %acc : tensor -> tensor>> %3 = linalg.matmul - ins(%0, %1 : tensor>, - tensor>) - outs(%2 : tensor>) - -> tensor> - %4 = iree_encoding.unset_encoding %3 : tensor> -> tensor + ins(%0, %1 : tensor>>, + tensor>>) + outs(%2 : tensor>>) + -> tensor>> + %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor return %4 : tensor } @@ -1631,15 +1539,15 @@ func.func @matmul_lowering_f32f32f32_aarch64_sve(%lhs : tensor, %rhs: t func.func @matmul_lowering_f32f32f32_riscv(%lhs : tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="riscv32-xyz-xyz"}> } { - %0 = iree_encoding.set_encoding %lhs : tensor -> tensor> - %1 = iree_encoding.set_encoding %rhs : tensor -> tensor> - %2 = iree_encoding.set_encoding %acc : tensor -> tensor> + %0 = iree_encoding.set_encoding %lhs : tensor -> tensor>> + %1 = iree_encoding.set_encoding %rhs : tensor -> tensor>> + %2 = iree_encoding.set_encoding %acc : tensor -> tensor>> %3 = linalg.matmul - ins(%0, %1 : tensor>, - tensor>) - outs(%2 : tensor>) - -> tensor> - %4 = iree_encoding.unset_encoding %3 : tensor> -> tensor + ins(%0, %1 : tensor>>, + tensor>>) + outs(%2 : tensor>>) + -> tensor>> + %4 = 
iree_encoding.unset_encoding %3 : tensor>> -> tensor return %4 : tensor } // RISC-V targets do not implement data-tiling yet. @@ -1660,28 +1568,28 @@ func.func @matmul_lowering_i8i8i32_riscv32_ukernel() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1725,28 +1633,28 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx2() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1790,28 +1698,28 @@ func.func
@matmul_lowering_i8i8i32_x86_64_avx512bw() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1855,28 +1763,28 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512vnni() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1915,41 +1823,26 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512vnni() attributes { func.func @extend_batch_vecmat_explicit_unit_dim(%arg0: tensor<32x1x128xi8>, %arg1: tensor<32x128x11008xi8>) -> 
tensor<32x1x11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c11008 = arith.constant 11008 : index - %c0_i8 = arith.constant 0 : i8 %c0_i32 = arith.constant 0 : i32 - %padded = tensor.pad %arg0 low[0, 0, 0] high[%c0, %c0, %c0] { - ^bb0(%arg2: index, %arg3: index, %arg4: index): - tensor.yield %c0_i8 : i8 - } : tensor<32x1x128xi8> to tensor - %4 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %5 = tensor.empty(%c32, %c1, %c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%5 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<32x1x128xi8> -> tensor<32x1x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %5 = tensor.empty() : tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<32x1x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %padded_0 = tensor.pad %arg1 low[0, 0, 0] high[%c0, %c0, %c0] { - ^bb0(%arg2: index, %arg3: index, %arg4: index): - tensor.yield %c0_i8 : i8 - } : tensor<32x128x11008xi8> to tensor - %7 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %8 = tensor.empty(%c32, %c128, %c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%8 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + } -> tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %7 = iree_encoding.set_encoding %arg1 : tensor<32x128x11008xi8> -> tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %8 = tensor.empty() : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { 
^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %10 = tensor.empty(%c32, %c1, %c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %12 = linalg.batch_matmul ins(%6, %9 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %13 = iree_encoding.unset_encoding %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %13[0, 0, 0] [32, 1, 11008] [1, 1, 1] : tensor to tensor<32x1x11008xi32> - return %extracted_slice : tensor<32x1x11008xi32> + } -> tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %10 = tensor.empty() : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = linalg.batch_matmul ins(%6, %9 : tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %13 = iree_encoding.unset_encoding %12 : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<32x1x11008xi32> + return %13 : tensor<32x1x11008xi32> } // CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> @@ -1990,28 +1883,28 @@ func.func @matmul_lowering_i16i16i32_x86_64_avx2() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> 
tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -2055,35 +1948,35 @@ func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %out_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %rhs_i4 = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> - %empty = tensor.empty(%K, %N) : tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> + %empty = tensor.empty(%K, %N) : tensor>> %rhs_i32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} - ins(%rhs_i4 : tensor>) outs(%empty : tensor>) { + ins(%rhs_i4 : tensor>>) outs(%empty : tensor>>) { ^bb0(%in: i4, %out: i32): %17 = arith.extui %in : i4 to i32 linalg.yield %17 : i32 - } -> tensor> + } -> tensor>> %out = flow.dispatch.tensor.load %out_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %result = linalg.matmul - ins(%lhs, %rhs_i32 : tensor>, - tensor>) - outs(%out : tensor>) - -> tensor> + ins(%lhs, %rhs_i32 : tensor>>, + tensor>>) + outs(%out : tensor>>) + -> tensor>> flow.dispatch.tensor.store %result, %out_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } @@ -2115,40 +2008,26 @@ func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() attributes { func.func @vecmat(%arg0: tensor<128xi8>, %arg1: tensor<128x11008xi8>) -> tensor<11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c11008 = arith.constant 11008 : index - %c0_i8 = arith.constant 0 : i8 %c0_i32 = arith.constant 0 : i32 - %padded = tensor.pad %arg0 low[0] high[%c0] { - ^bb0(%arg2: index): - tensor.yield %c0_i8 : i8 - } : tensor<128xi8> to tensor - %4 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %5 = tensor.empty(%c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) 
-> (d0)>], iterator_types = ["parallel"]} ins(%4 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%5 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<128xi8> -> tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %5 = tensor.empty() : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4 : tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %padded_0 = tensor.pad %arg1 low[0, 0] high[%c0, %c0] { - ^bb0(%arg2: index, %arg3: index): - tensor.yield %c0_i8 : i8 - } : tensor<128x11008xi8> to tensor - %7 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %8 = tensor.empty(%c128, %c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%8 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + } -> tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %7 = iree_encoding.set_encoding %arg1 : tensor<128x11008xi8> -> tensor<128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %8 = tensor.empty() : tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %10 = tensor.empty(%c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %12 = linalg.vecmat ins(%6, %9 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %13 = iree_encoding.unset_encoding %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %13[0] [11008] [1] : tensor to tensor<11008xi32> - return %extracted_slice : tensor<11008xi32> + } -> tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %10 = tensor.empty() : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, 
#map2], round_dims_to = array>> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = linalg.vecmat ins(%6, %9 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %13 = iree_encoding.unset_encoding %12 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<11008xi32> + return %13 : tensor<11008xi32> } // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)> @@ -2188,40 +2067,26 @@ func.func @vecmat(%arg0: tensor<128xi8>, %arg1: tensor<128x11008xi8>) -> tensor< func.func @matvec(%arg0: tensor<11008x128xi8>, %arg1: tensor<128xi8>) -> tensor<11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c11008 = arith.constant 11008 : index - %c0_i8 = arith.constant 0 : i8 %c0_i32 = arith.constant 0 : i32 - %padded = tensor.pad %arg0 low[0, 0] high[%c0, %c0] { - ^bb0(%arg2: index, %arg3: index): - tensor.yield %c0_i8 : i8 - } : tensor<11008x128xi8> to tensor - %4 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %5 = tensor.empty(%c11008, %c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%5 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<11008x128xi8> -> tensor<11008x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %5 = tensor.empty() : tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<11008x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %padded_0 = tensor.pad %arg1 low[0] high[%c0] { - ^bb0(%arg2: index): - tensor.yield %c0_i8 : i8 - } : tensor<128xi8> to tensor - %7 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %8 = tensor.empty(%c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} 
ins(%7 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%8 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + } -> tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %7 = iree_encoding.set_encoding %arg1 : tensor<128xi8> -> tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %8 = tensor.empty() : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %10 = tensor.empty(%c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %12 = linalg.matvec ins(%6, %9 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %13 = iree_encoding.unset_encoding %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %13[0] [11008] [1] : tensor to tensor<11008xi32> - return %extracted_slice : tensor<11008xi32> + } -> tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %10 = tensor.empty() : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = linalg.matvec ins(%6, %9 : tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %13 = iree_encoding.unset_encoding %12 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<11008xi32> + return %13 : tensor<11008xi32> } // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> @@ -2261,40 +2126,26 @@ func.func @matvec(%arg0: tensor<11008x128xi8>, %arg1: tensor<128xi8>) -> tensor< func.func @matvec_with_narrow_M(%arg0: tensor<15x128xi8>, %arg1: tensor<128xi8>) -> tensor<15xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c15 
= arith.constant 15 : index - %c0_i8 = arith.constant 0 : i8 %c0_i32 = arith.constant 0 : i32 - %padded = tensor.pad %arg0 low[0, 0] high[%c0, %c0] { - ^bb0(%arg2: index, %arg3: index): - tensor.yield %c0_i8 : i8 - } : tensor<15x128xi8> to tensor - %4 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %5 = tensor.empty(%c15, %c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%5 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<15x128xi8> -> tensor<15x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %5 = tensor.empty() : tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<15x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %padded_0 = tensor.pad %arg1 low[0] high[%c0] { - ^bb0(%arg2: index): - tensor.yield %c0_i8 : i8 - } : tensor<128xi8> to tensor - %7 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %8 = tensor.empty(%c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%8 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + } -> tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %7 = iree_encoding.set_encoding %arg1 : tensor<128xi8> -> tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %8 = tensor.empty() : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %10 = tensor.empty(%c15) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %12 = linalg.matvec ins(%6, %9 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> 
- %13 = iree_encoding.unset_encoding %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %13[0] [15] [1] : tensor to tensor<15xi32> - return %extracted_slice : tensor<15xi32> + } -> tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %10 = tensor.empty() : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = linalg.matvec ins(%6, %9 : tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %13 = iree_encoding.unset_encoding %12 : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<15xi32> + return %13 : tensor<15xi32> } // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> @@ -2335,41 +2186,26 @@ func.func @matvec_with_narrow_M(%arg0: tensor<15x128xi8>, %arg1: tensor<128xi8>) func.func @batch_vecmat(%arg0: tensor<32x128xi8>, %arg1: tensor<32x128x11008xi8>) -> tensor<32x11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128 = arith.constant 128 : index - %c11008 = arith.constant 11008 : index - %c0_i8 = arith.constant 0 : i8 %c0_i32 = arith.constant 0 : i32 - %padded = tensor.pad %arg0 low[0, 0] high[%c0, %c0] { - ^bb0(%arg2: index, %arg3: index): - tensor.yield %c0_i8 : i8 - } : tensor<32x128xi8> to tensor - %4 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %5 = tensor.empty(%c32, %c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%5 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<32x128xi8> -> tensor<32x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %5 = tensor.empty() : tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<32x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = 
[#map, #map1, #map2]>> - %padded_0 = tensor.pad %arg1 low[0, 0, 0] high[%c0, %c0, %c0] { - ^bb0(%arg2: index, %arg3: index, %arg4: index): - tensor.yield %c0_i8 : i8 - } : tensor<32x128x11008xi8> to tensor - %7 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %8 = tensor.empty(%c32, %c128, %c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%8 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + } -> tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %7 = iree_encoding.set_encoding %arg1 : tensor<32x128x11008xi8> -> tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %8 = tensor.empty() : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %10 = tensor.empty(%c32, %c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %12 = linalg.batch_vecmat ins(%6, %9 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %13 = iree_encoding.unset_encoding %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %13[0, 0] [32, 11008] [1, 1] : tensor to tensor<32x11008xi32> - return %extracted_slice : tensor<32x11008xi32> + } -> tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %10 = tensor.empty() : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = linalg.batch_vecmat ins(%6, %9 : tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to 
= array>> + %13 = iree_encoding.unset_encoding %12 : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<32x11008xi32> + return %13 : tensor<32x11008xi32> } // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> @@ -2406,31 +2242,15 @@ func.func @batch_vecmat(%arg0: tensor<32x128xi8>, %arg1: tensor<32x128x11008xi8> func.func @batch_matvec(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { - %c0 = arith.constant 0 : index - %c0_i32 = arith.constant 0 : i32 - %c0_i8 = arith.constant 0 : i8 %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x11008x128xi8> %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<32x128xi8> %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<32x11008xi32> - %padded = tensor.pad %0 low[0, 0, 0] high[%c0, %c0, %c0] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): - tensor.yield %c0_i8 : i8 - } : tensor<32x11008x128xi8> to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_0 = tensor.pad %1 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %c0_i8 : i8 - } : tensor<32x128xi8> to tensor - %4 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_1 = tensor.pad %2 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %c0_i32 : i32 - } : tensor<32x11008xi32> to tensor - %5 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %6 = linalg.batch_matvec ins(%3, %4 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%5 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %7 = iree_encoding.unset_encoding %6 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor - %extracted_slice = tensor.extract_slice %7[0, 0] [32, 11008] [1, 1] : tensor to tensor<32x11008xi32> - %8 = hal.tensor.export %extracted_slice "output0" : tensor<32x11008xi32> -> !hal.buffer_view + %3 = iree_encoding.set_encoding %0 : tensor<32x11008x128xi8> -> tensor<32x11008x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, 
user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %4 = iree_encoding.set_encoding %1 : tensor<32x128xi8> -> tensor<32x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %5 = iree_encoding.set_encoding %2 : tensor<32x11008xi32> -> tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %6 = linalg.batch_matvec ins(%3, %4 : tensor<32x11008x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>, tensor<32x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) outs(%5 : tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) -> tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> + %7 = iree_encoding.unset_encoding %6 : tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> -> tensor<32x11008xi32> + %8 = hal.tensor.export %7 "output0" : tensor<32x11008xi32> -> !hal.buffer_view func.return %8 : !hal.buffer_view } @@ -2449,34 +2269,12 @@ func.func @matmul_transpose_a_f32f32f32(%arg0: tensor<256x128xf32>, %arg1: tenso %c128 = arith.constant 128 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %3:2 = iree_encoding.upper_bound_tile_size tensor<256x128xf32, #iree_encoding.encoding> -> index, index - %4 = affine.apply #map3()[%3#0, %c256] - %5 = affine.apply #map3()[%3#1, %c128] - %padded = tensor.pad %arg0 low[0, 0] high[%4, %5] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<256x128xf32> to tensor - %6 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %7:2 = iree_encoding.upper_bound_tile_size tensor<256x512xf32, #iree_encoding.encoding> -> index, index - %8 = affine.apply #map3()[%7#0, %c256] - %9 = affine.apply #map3()[%7#1, %c512] - %padded_0 = tensor.pad %arg1 low[0, 0] high[%8, %9] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<256x512xf32> to tensor - %10 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %11:2 = iree_encoding.upper_bound_tile_size tensor<128x512xf32, #iree_encoding.encoding> -> index, index - %12 = affine.apply #map3()[%11#0, %c128] - %13 = affine.apply #map3()[%11#1, %c512] - %padded_1 = tensor.pad %arg2 low[0, 0] 
high[%12, %13] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<128x512xf32> to tensor - %14 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %15 = linalg.matmul_transpose_a ins(%6, %10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%14 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %16 = iree_encoding.unset_encoding %15 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %16[0, 0] [128, 512] [1, 1] : tensor to tensor<128x512xf32> - return %extracted_slice : tensor<128x512xf32> + %6 = iree_encoding.set_encoding %arg0 : tensor<256x128xf32> -> tensor<256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %10 = iree_encoding.set_encoding %arg1 : tensor<256x512xf32> -> tensor<256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %14 = iree_encoding.set_encoding %arg2 : tensor<128x512xf32> -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %15 = linalg.matmul_transpose_a ins(%6, %10 : tensor<256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%14 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %16 = iree_encoding.unset_encoding %15 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<128x512xf32> + return %16 : tensor<128x512xf32> } // CHECK-LABEL: func.func @matmul_transpose_a_f32f32f32( @@ -2507,34 +2305,12 @@ func.func @matmul_transpose_b_f32f32f32(%arg0: tensor<128x256xf32>, %arg1: tenso %c256 = arith.constant 256 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %3:2 = iree_encoding.upper_bound_tile_size tensor<128x256xf32, #iree_encoding.encoding> -> index, index - %4 = affine.apply #map3()[%3#0, %c128] - %5 = affine.apply #map3()[%3#1, %c256] - %padded = tensor.pad %arg0 low[0, 0] high[%4, %5] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<128x256xf32> to tensor - %6 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %7:2 = iree_encoding.upper_bound_tile_size tensor<512x256xf32, #iree_encoding.encoding> -> index, index - %8 = affine.apply #map3()[%7#0, %c512] - %9 = affine.apply #map3()[%7#1, %c256] - %padded_0 = tensor.pad %arg1 low[0, 0] high[%8, %9] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<512x256xf32> to tensor - %10 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %11:2 = iree_encoding.upper_bound_tile_size tensor<128x512xf32, #iree_encoding.encoding> -> index, index - %12 = affine.apply #map3()[%11#0, %c128] - %13 = affine.apply #map3()[%11#1, %c512] - %padded_1 = tensor.pad %arg2 low[0, 0] high[%12, %13] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<128x512xf32> to tensor - %14 = 
iree_encoding.set_encoding %padded_1 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %15 = linalg.matmul_transpose_b ins(%6, %10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%14 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %16 = iree_encoding.unset_encoding %15 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %16[0, 0] [128, 512] [1, 1] : tensor to tensor<128x512xf32> - return %extracted_slice : tensor<128x512xf32> + %6 = iree_encoding.set_encoding %arg0 : tensor<128x256xf32> -> tensor<128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %10 = iree_encoding.set_encoding %arg1 : tensor<512x256xf32> -> tensor<512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %14 = iree_encoding.set_encoding %arg2 : tensor<128x512xf32> -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %15 = linalg.matmul_transpose_b ins(%6, %10 : tensor<128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%14 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %16 = iree_encoding.unset_encoding %15 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<128x512xf32> + return %16 : tensor<128x512xf32> } // CHECK-LABEL: func.func @matmul_transpose_b_f32f32f32( @@ -2565,37 +2341,12 @@ func.func @batch_matmul_transpose_a_f32f32f32(%arg0: tensor<2x256x128xf32>, %arg %c128 = arith.constant 128 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %3:3 = iree_encoding.upper_bound_tile_size tensor<2x256x128xf32, #iree_encoding.encoding> -> index, index, index - %4 = affine.apply #map3()[%3#0, %c2] - %5 = affine.apply #map3()[%3#1, %c256] - %6 = affine.apply #map3()[%3#2, %c128] - %padded = tensor.pad %arg0 low[0, 0, 0] high[%4, %5, %6] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): - tensor.yield %cst : f32 - } : tensor<2x256x128xf32> to tensor - %7 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %8:3 = iree_encoding.upper_bound_tile_size tensor<2x256x512xf32, #iree_encoding.encoding> -> index, index, index - %9 = affine.apply #map3()[%8#0, %c2] - %10 = affine.apply #map3()[%8#1, %c256] - %11 = affine.apply #map3()[%8#2, %c512] - %padded_0 = tensor.pad %arg1 low[0, 0, 0] high[%9, %10, %11] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): - tensor.yield %cst : f32 - } : tensor<2x256x512xf32> to tensor - %12 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %13:3 = iree_encoding.upper_bound_tile_size tensor<2x128x512xf32, #iree_encoding.encoding> -> index, index, index - %14 = affine.apply #map3()[%13#0, %c2] - %15 = affine.apply #map3()[%13#1, %c128] - %16 = affine.apply #map3()[%13#2, %c512] - %padded_1 = tensor.pad %arg2 low[0, 0, 0] high[%14, %15, %16] { - ^bb0(%arg3: index, %arg4: index, 
%arg5: index): - tensor.yield %cst : f32 - } : tensor<2x128x512xf32> to tensor - %17 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %18 = linalg.batch_matmul_transpose_a ins(%7, %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%17 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %19 = iree_encoding.unset_encoding %18 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %19[0, 0, 0] [2, 128, 512] [1, 1, 1] : tensor to tensor<2x128x512xf32> - return %extracted_slice : tensor<2x128x512xf32> + %7 = iree_encoding.set_encoding %arg0 : tensor<2x256x128xf32> -> tensor<2x256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = iree_encoding.set_encoding %arg1 : tensor<2x256x512xf32> -> tensor<2x256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %17 = iree_encoding.set_encoding %arg2 : tensor<2x128x512xf32> -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %18 = linalg.batch_matmul_transpose_a ins(%7, %12 : tensor<2x256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<2x256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%17 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %19 = iree_encoding.unset_encoding %18 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<2x128x512xf32> + return %19 : tensor<2x128x512xf32> } // CHECK-LABEL: func.func @batch_matmul_transpose_a_f32f32f32( @@ -2626,37 +2377,12 @@ func.func @batch_matmul_transpose_b_f32f32f32(%arg0: tensor<2x128x256xf32>, %arg %c256 = arith.constant 256 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %3:3 = iree_encoding.upper_bound_tile_size tensor<2x128x256xf32, #iree_encoding.encoding> -> index, index, index - %4 = affine.apply #map3()[%3#0, %c2] - %5 = affine.apply #map3()[%3#1, %c128] - %6 = affine.apply #map3()[%3#2, %c256] - %padded = tensor.pad %arg0 low[0, 0, 0] high[%4, %5, %6] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): - tensor.yield %cst : f32 - } : tensor<2x128x256xf32> to tensor - %7 = iree_encoding.set_encoding %padded : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %8:3 = iree_encoding.upper_bound_tile_size tensor<2x512x256xf32, #iree_encoding.encoding> -> index, index, index - %9 = affine.apply #map3()[%8#0, %c2] - %10 = affine.apply #map3()[%8#1, %c512] - %11 = affine.apply #map3()[%8#2, %c256] - %padded_0 = tensor.pad %arg1 low[0, 0, 0] high[%9, %10, %11] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): - tensor.yield %cst : f32 - } : tensor<2x512x256xf32> to tensor - %12 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %13:3 = iree_encoding.upper_bound_tile_size tensor<2x128x512xf32, #iree_encoding.encoding> -> index, index, index - %14 = affine.apply #map3()[%13#0, %c2] - %15 = affine.apply #map3()[%13#1, %c128] - 
%16 = affine.apply #map3()[%13#2, %c512] - %padded_1 = tensor.pad %arg2 low[0, 0, 0] high[%14, %15, %16] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): - tensor.yield %cst : f32 - } : tensor<2x128x512xf32> to tensor - %17 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %18 = linalg.batch_matmul_transpose_b ins(%7, %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%17 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> - %19 = iree_encoding.unset_encoding %18 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %19[0, 0, 0] [2, 128, 512] [1, 1, 1] : tensor to tensor<2x128x512xf32> - return %extracted_slice : tensor<2x128x512xf32> + %7 = iree_encoding.set_encoding %arg0 : tensor<2x128x256xf32> -> tensor<2x128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = iree_encoding.set_encoding %arg1 : tensor<2x512x256xf32> -> tensor<2x512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %17 = iree_encoding.set_encoding %arg2 : tensor<2x128x512xf32> -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %18 = linalg.batch_matmul_transpose_b ins(%7, %12 : tensor<2x128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<2x512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%17 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %19 = iree_encoding.unset_encoding %18 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<2x128x512xf32> + return %19 : tensor<2x128x512xf32> } // CHECK-LABEL: func.func @batch_matmul_transpose_b_f32f32f32( @@ -2688,42 +2414,19 @@ func.func @generic_batch_vecmat_transposed_i16u4i32(%arg0: tensor<32x128xi16>, % %c0_i16 = arith.constant 0 : i16 %c128 = arith.constant 128 : index %c32 = arith.constant 32 : index - %0:2 = iree_encoding.upper_bound_tile_size tensor<32x128xi16, #iree_encoding.encoding> -> index, index - %1 = affine.apply #map3()[%0#0, %c32] - %2 = affine.apply #map3()[%0#1, %c128] - %padded = tensor.pad %arg0 low[0, 0] high[%1, %2] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %c0_i16 : i16 - } : tensor<32x128xi16> to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>> - %4:3 = iree_encoding.upper_bound_tile_size tensor<4096x32x128xi4, #iree_encoding.encoding> -> index, index, index - %5 = affine.apply #map3()[%4#0, %c4096] - %6 = affine.apply #map3()[%4#1, %c32] - %7 = affine.apply #map3()[%4#2, %c128] - %padded_0 = tensor.pad %arg1 low[0, 0, 0] high[%5, %6, %7] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): - tensor.yield %c0_i4 : i4 - } : tensor<4096x32x128xi4> to tensor - %8 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>> - %9:2 = iree_encoding.upper_bound_tile_size tensor<4096x32xi32, 
#iree_encoding.encoding> -> index, index - %10 = affine.apply #map3()[%9#0, %c4096] - %11 = affine.apply #map3()[%9#1, %c32] - %padded_1 = tensor.pad %arg2 low[0, 0] high[%10, %11] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %c0_i32 : i32 - } : tensor<4096x32xi32> to tensor - %12 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>> - %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %8 : tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>>, tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>>) outs(%12 : tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>>) { + %3 = iree_encoding.set_encoding %arg0 : tensor<32x128xi16> -> tensor<32x128xi16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %8 = iree_encoding.set_encoding %arg1 : tensor<4096x32x128xi4> -> tensor<4096x32x128xi4, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %12 = iree_encoding.set_encoding %arg2 : tensor<4096x32xi32> -> tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %8 : tensor<32x128xi16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<4096x32x128xi4, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%12 : tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { ^bb0(%in: i16, %in_2: i4, %out: i32): %15 = arith.extsi %in : i16 to i32 %16 = arith.extui %in_2 : i4 to i32 %17 = arith.muli %15, %16 : i32 %18 = arith.addi %17, %out : i32 linalg.yield %18 : i32 - } -> tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>> - %14 = iree_encoding.unset_encoding %13 : tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2]>> -> tensor - %extracted_slice = tensor.extract_slice %14[0, 0] [4096, 32] [1, 1] : tensor to tensor<4096x32xi32> - return %extracted_slice : tensor<4096x32xi32> + } -> tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %14 = iree_encoding.unset_encoding %13 : tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<4096x32xi32> + return %14 : tensor<4096x32xi32> } // CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir index 4c0fd3265fbc..52d78ca65a2a 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir @@ -11,28 +11,28 @@ func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { %N = 
hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : !flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } @@ -42,17 +42,17 @@ func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { // CHECK-DAG: %[[M:.+]] = hal.interface.constant.load[0] // CHECK-DAG: %[[N:.+]] = hal.interface.constant.load[1] // CHECK-DAG: %[[K:.+]] = hal.interface.constant.load[2] -// CHECK: %[[LHS_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor> -> index, index +// CHECK: %[[LHS_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor>> -> index, index // CHECK-DAG: %[[LHS_OUTER_SIZE0:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[M]], %[[LHS_TILE_SIZES]]#0] // CHECK-DAG: %[[LHS_OUTER_SIZE1:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[K]], %[[LHS_TILE_SIZES]]#1] // CHECK: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) // CHECK-SAME: !flow.dispatch.tensor>{%[[LHS_OUTER_SIZE0]], %[[LHS_OUTER_SIZE1]], %[[LHS_TILE_SIZES]]#0, %[[LHS_TILE_SIZES]]#1} -// CHECK: %[[RHS_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor> -> index, index +// CHECK: %[[RHS_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor>> -> index, index // CHECK-DAG: %[[RHS_OUTER_SIZE0:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[N]], %[[RHS_TILE_SIZES]]#0] // CHECK-DAG: %[[RHS_OUTER_SIZE1:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[K]], %[[RHS_TILE_SIZES]]#1] // CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor>{%[[RHS_OUTER_SIZE0]], %[[RHS_OUTER_SIZE1]], %[[RHS_TILE_SIZES]]#0, %[[RHS_TILE_SIZES]]#1} -// CHECK: %[[RESULT_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor> -> index, index +// CHECK: %[[RESULT_TILE_SIZES:.+]]:2 = iree_codegen.query_tile_sizes tensor>> -> index, index // CHECK-DAG: %[[RESULT_OUTER_SIZE0:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[M]], %[[RESULT_TILE_SIZES]]#0] // CHECK-DAG: %[[RESULT_OUTER_SIZE1:.+]] = affine.apply #[[MAP_CEILDIV]]()[%[[N]], %[[RESULT_TILE_SIZES]]#1] // CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) @@ -82,17 +82,15 @@ func.func @fill_matmul(%arg0: index, %arg1: index, %arg2: index, 
%arg3: index, % %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4]>>>{%arg0, %arg1} - %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4]>>>{%arg2, %arg3} - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4]>>>{%arg4, %arg5} - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%arg0, %arg1], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4]>>>{%arg0, %arg1} -> tensor, user_indexing_maps = [#map2, #map3, #map4]>> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%arg2, %arg3], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4]>>>{%arg2, %arg3} -> tensor, user_indexing_maps = [#map2, #map3, #map4]>> - %5 = affine.apply #map()[%arg6] - %6 = affine.apply #map1()[%arg7] - %7 = tensor.empty(%6, %5) : tensor, user_indexing_maps = [#map2, #map3, #map4]>> - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor, user_indexing_maps = [#map2, #map3, #map4]>>) -> tensor, user_indexing_maps = [#map2, #map3, #map4]>> - %9 = linalg.matmul ins(%3, %4 : tensor, user_indexing_maps = [#map2, #map3, #map4]>>, tensor, user_indexing_maps = [#map2, #map3, #map4]>>) outs(%8 : tensor, user_indexing_maps = [#map2, #map3, #map4]>>) -> tensor, user_indexing_maps = [#map2, #map3, #map4]>> - flow.dispatch.tensor.store %9, %2, offsets = [0, 0], sizes = [%arg4, %arg5], strides = [1, 1] : tensor, user_indexing_maps = [#map2, #map3, #map4]>> -> !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4]>>>{%arg4, %arg5} + %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> + %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> + %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>>{%arg4, %arg5} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 2], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> -> tensor<1x2xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> -> tensor<2x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> + %7 = tensor.empty() : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> + %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>) -> tensor<1x3xf32, #iree_encoding.encoding, 
user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> + %9 = linalg.matmul ins(%3, %4 : tensor<1x2xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>, tensor<2x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>) outs(%8 : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>) -> tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> + flow.dispatch.tensor.store %9, %2, offsets = [0, 0], sizes = [1, 3], strides = [1, 1] : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> -> !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> return } // CHECK: func.func @fill_matmul @@ -126,27 +124,18 @@ func.func @set_encoding_dynamic() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb"> } { %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 %d0 = hal.interface.constant.load [0] : index %d1 = hal.interface.constant.load [1] : index - %outd0 = hal.interface.constant.load [2] : index - %outd1 = hal.interface.constant.load [3] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%d0, %d1} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%outd0, %outd1} + : !flow.dispatch.tensor>>>{%d0, %d1} %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] : !flow.dispatch.tensor>{%d0, %d1} -> tensor - %p0 = affine.apply affine_map<()[s0, s1] -> (-s0 + s1)>()[%d0, %outd0] - %p1 = affine.apply affine_map<()[s0, s1] -> (-s0 + s1)>()[%d1, %outd1] - %padded = tensor.pad %2 low[0, 0] high[%p0, %p1] { - ^bb0(%arg0: index, %arg1: index): - tensor.yield %cst : f32 - } : tensor to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor> - flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [%outd0, %outd1], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%outd0, %outd1} + %3 = iree_encoding.set_encoding %2 : tensor -> tensor>> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] + : tensor>> + -> !flow.dispatch.tensor>>>{%d0, %d1} return } // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -156,20 +145,18 @@ func.func @set_encoding_dynamic() attributes { // CHECK-DAG: %[[CST:.+]] = arith.constant 0.0 // CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] // CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[OUTD0:.+]] = hal.interface.constant.load[2] -// CHECK-DAG: %[[OUTD1:.+]] = hal.interface.constant.load[3] // CHECK: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) -// CHECK-DAG: %[[TILED_OUTD0:.+]] = affine.apply #[[MAP0]]()[%[[OUTD0]]] -// CHECK-DAG: %[[TILED_OUTD1:.+]] = affine.apply #[[MAP1]]()[%[[OUTD1]]] +// CHECK-DAG: %[[TILED_D0:.+]] = affine.apply #[[MAP0]]()[%[[D0]]] +// CHECK-DAG: %[[TILED_D1:.+]] = affine.apply #[[MAP1]]()[%[[D1]]] // CHECK-DAG: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) -// CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_OUTD0]], %[[TILED_OUTD1]]} +// CHECK-SAME: !flow.dispatch.tensor>{%[[TILED_D0]], %[[TILED_D1]]} // CHECK: %[[INPUT:.+]] = flow.dispatch.tensor.load 
%[[INPUT_BINDING]] // CHECK: %[[EMPTY:.+]] = tensor.empty // CHECK: %[[PACK:.+]] = tensor.pack // CHECK-SAME: %[[INPUT]] padding_value(%[[CST]] : f32) // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %[[EMPTY]] // CHECK: flow.dispatch.tensor.store %[[PACK]], %[[OUTPUT_BINDING]] -// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_OUTD0]], %[[TILED_OUTD1]], 8, 4], strides = [1, 1, 1, 1] +// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_D0]], %[[TILED_D1]], 8, 4], strides = [1, 1, 1, 1] // ----- @@ -183,20 +170,18 @@ func.func @unset_encoding_dynamic() attributes { %cst = arith.constant 0.000000e+00 : f32 %d0 = hal.interface.constant.load [0] : index %d1 = hal.interface.constant.load [1] : index - %outd0 = hal.interface.constant.load [2] : index - %outd1 = hal.interface.constant.load [3] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%d0, %d1} + : !flow.dispatch.tensor>>>{%d0, %d1} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>{%outd0, %outd1} + : !flow.dispatch.tensor>{%d0, %d1} %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] - : !flow.dispatch.tensor>>{%d0, %d1} - -> tensor> + : !flow.dispatch.tensor>>>{%d0, %d1} + -> tensor>> %3 = iree_encoding.unset_encoding %2 - : tensor> -> tensor - %4 = tensor.extract_slice %3[0, 0] [%outd0, %outd1] [1, 1] : tensor to tensor - flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [%outd0, %outd1], strides = [1, 1] - : tensor -> !flow.dispatch.tensor>{%outd0, %outd1} + : tensor>> -> tensor + %4 = tensor.extract_slice %3[0, 0] [%d0, %d1] [1, 1] : tensor to tensor + flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] + : tensor -> !flow.dispatch.tensor>{%d0, %d1} return } // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -205,8 +190,6 @@ func.func @unset_encoding_dynamic() attributes { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[D0:.+]] = hal.interface.constant.load[0] // CHECK-DAG: %[[D1:.+]] = hal.interface.constant.load[1] -// CHECK-DAG: %[[OUTD0:.+]] = hal.interface.constant.load[2] -// CHECK-DAG: %[[OUTD1:.+]] = hal.interface.constant.load[3] // CHECK-DAG: %[[TILED_D0:.+]] = affine.apply #[[MAP0]]()[%[[D0]]] // CHECK-DAG: %[[TILED_D1:.+]] = affine.apply #[[MAP1]]()[%[[D1]]] // CHECK-DAG: %[[INPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(0) @@ -214,7 +197,7 @@ func.func @unset_encoding_dynamic() attributes { // CHECK-DAG: %[[OUTPUT_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) // CHECK: %[[INPUT:.+]] = flow.dispatch.tensor.load %[[INPUT_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [%[[TILED_D0]], %[[TILED_D1]], 8, 4], strides = [1, 1, 1, 1] -// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[OUTD0]], %[[OUTD1]]) +// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[D0]], %[[D1]]) // CHECK: %[[UNPACK:.+]] = tensor.unpack %[[INPUT]] // CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [8, 4] into %[[EMPTY]] // CHECK-DAG: flow.dispatch.tensor.store %[[UNPACK]], %[[OUTPUT_BINDING]] @@ -232,28 +215,28 @@ func.func @matmul_lowering_f32f32f32_generic() attributes { %N = hal.interface.constant.load[1] : index %K = hal.interface.constant.load[2] : index %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %K} + : 
!flow.dispatch.tensor>>>{%M, %K} %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%K, %N} + : !flow.dispatch.tensor>>>{%K, %N} %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>{%M, %N} + : !flow.dispatch.tensor>>>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %K} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %K} + -> tensor>> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%K, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%K, %N} + -> tensor>> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>{%M, %N} - -> tensor> + : !flow.dispatch.tensor>>>{%M, %N} + -> tensor>> %6 = linalg.matmul - ins(%3, %4 : tensor>, - tensor>) - outs(%5 : tensor>) - -> tensor> + ins(%3, %4 : tensor>>, + tensor>>) + outs(%5 : tensor>>) + -> tensor>> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor> - -> !flow.dispatch.tensor>>{%M, %N} + : tensor>> + -> !flow.dispatch.tensor>>>{%M, %N} return } // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> diff --git a/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp b/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp index 143b9694967f..30baabc293a5 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp +++ b/compiler/src/iree/compiler/GlobalOptimization/MaterializeHomogeneousEncodings.cpp @@ -14,6 +14,7 @@ #include "iree/compiler/Utils/PassUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" @@ -33,7 +34,7 @@ class MaterializeHomogeneousEncodingsPass MaterializeHomogeneousEncodingsPass() = default; void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } void runNopPipeline(ModuleOp &moduleOp) { diff --git a/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp b/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp index c86267a9829e..70d65f0d8cea 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp +++ b/compiler/src/iree/compiler/GlobalOptimization/Passes.cpp @@ -63,6 +63,12 @@ static llvm::cl::opt clDemoteContractionInputsToBF16Strategy( clEnumValN(DemotionOption::None, "none", "Demote no contraction ops.")), llvm::cl::init(DemotionOption::None)); +static llvm::cl::opt clPadFactor( + "iree-global-opt-pad-factor", + llvm::cl::desc("provides padding size hints that will be attached to " + "encodings."), + llvm::cl::init(32)); + void buildGlobalOptExprHoistingPassPipeline( OpPassManager &passManager, const TransformOptions &transformOptions) { IREE::Util::ExprHoistingOptions options; @@ -164,10 +170,7 @@ void buildGlobalOptimizationPassPipeline( if (transformOptions.options.dataTiling) { // TODO(hanchung): Make data-tiling passes be FunctionOpInterface pass, so // we can use `FunctionLikNest` here. - // TODO(hanchung): Make it controlable through flags. It is fine for now - // because it is an experimental path. - const int64_t kPadFactor = clEnableEarlyMaterialization ? 
0 : 16;
-    mainPassManager.addPass(createSetEncodingPass(kPadFactor));
+    mainPassManager.addPass(createSetEncodingPass(clPadFactor));
     if (clEnableEarlyMaterialization) {
       mainPassManager.addPass(createMaterializeHomogeneousEncodingsPass());
     }
diff --git a/compiler/src/iree/compiler/GlobalOptimization/Passes.h b/compiler/src/iree/compiler/GlobalOptimization/Passes.h
index 6ddbeaba1cb3..ac48c1bbd819 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/Passes.h
+++ b/compiler/src/iree/compiler/GlobalOptimization/Passes.h
@@ -113,10 +113,10 @@ std::unique_ptr createRaiseSpecialOps();
 std::unique_ptr> createRemoveZeroExtentTensorsPass();
-/// Sets encoding for tensors to allow tiled execution of operations. If
-/// `padFactor` is set to non-zero, the padding sizes hint will be attached to
-/// encodings. It makes the host and device agree with the same padding sizes.
-std::unique_ptr createSetEncodingPass(int64_t padFactor = 0);
+/// Sets encoding for tensors to allow tiled execution of operations. The
+/// `padFactor` provides padding size hints that will be attached to encodings.
+/// It ensures the host and device agree on the same padding sizes.
+std::unique_ptr createSetEncodingPass(int64_t padFactor = 32);
 /// Simplifies tensor pack/unpack ops to reshape ops.
 std::unique_ptr createSimplifyPackUnpackPass();
diff --git a/compiler/src/iree/compiler/GlobalOptimization/Passes.td b/compiler/src/iree/compiler/GlobalOptimization/Passes.td
index 0f3bcd336229..fa6660cecdf1 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/Passes.td
+++ b/compiler/src/iree/compiler/GlobalOptimization/Passes.td
@@ -152,11 +152,8 @@ def SetEncoding : Pass<"iree-global-opt-set-encoding", ""> {
   let summary = "Introduces tensor encoding for compute operations.";
   let constructor = "mlir::iree_compiler::GlobalOptimization::createSetEncodingPass()";
   let options = [
-    Option<"padFactor", "pad-factor", "int64_t", /*default=*/"0",
-           "The padding sizes hint will be attached to encodings if is it set"
-           "to non-zero. Otherwise, it creates"
-           "iree_encoding.upper_bound_tile_size and rely on backends to"
-           "resolve them.">,
+    Option<"padFactor", "pad-factor", "int64_t", /*default=*/"32",
+           "Provides padding size hints that will be attached to encodings.">,
   ];
 }
diff --git a/compiler/src/iree/compiler/GlobalOptimization/SetEncoding.cpp b/compiler/src/iree/compiler/GlobalOptimization/SetEncoding.cpp
index a7f84e66212a..ce9e5de37f52 100644
--- a/compiler/src/iree/compiler/GlobalOptimization/SetEncoding.cpp
+++ b/compiler/src/iree/compiler/GlobalOptimization/SetEncoding.cpp
@@ -50,40 +50,6 @@ using IREE::Encoding::EncodingAttr;
 // Utility functions
 //===---------------------------------------------------------------------===//
-/// Pads `value` enough for any actual tile sizes that could result from
-/// materialization of `encodingAttr`.
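The `pad()` helper removed just below computed the high padding as `(shape ceildiv tile) * tile - shape`; with `round_dims_to` attached to the encoding, the same round-up now happens at materialization time instead of being spelled out in the IR. A minimal standalone sketch of that arithmetic, assuming every dimension is rounded to the same `padFactor` (the helper name here is illustrative, not an IREE API):

#include <cassert>
#include <cstdint>

// Round `dim` up to the next multiple of `padFactor`; this mirrors the
// affine expression `(shape ceildiv tile) * tile` from the removed helper.
static int64_t roundUpTo(int64_t dim, int64_t padFactor) {
  assert(padFactor > 0);
  return ((dim + padFactor - 1) / padFactor) * padFactor;
}

int main() {
  // With the new default padFactor = 32, a 100x250 operand materializes as
  // if padded to 128x256; the set_encoding IR itself stays unpadded.
  assert(roundUpTo(100, 32) == 128);
  assert(roundUpTo(250, 32) == 256);
  // pad-factor=16, as in the updated RUN line below, rounds 100 up to 112.
  assert(roundUpTo(100, 16) == 112);
  return 0;
}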
-static Value pad(OpBuilder &builder, Location loc, Value source, - EncodingAttr encodingAttr) { - RankedTensorType sourceType = cast(source.getType()); - Type elemType = sourceType.getElementType(); - size_t rank = sourceType.getRank(); - RankedTensorType tensorTypeWithEncoding = - RankedTensorType::get(sourceType.getShape(), elemType, encodingAttr); - SmallVector lowPad(rank, builder.getIndexAttr(0)); - SmallVector resultTypes(rank, builder.getIndexType()); - - ValueRange encodingPaddingSizes = - builder - .create( - loc, resultTypes, TypeAttr::get(tensorTypeWithEncoding)) - .getResults(); - SmallVector highPad(rank); - AffineExpr tileExpr, shapeExpr; - bindSymbols(builder.getContext(), tileExpr, shapeExpr); - AffineExpr highPadExpr = shapeExpr.ceilDiv(tileExpr) * tileExpr - shapeExpr; - for (size_t i = 0; i < rank; ++i) { - highPad[i] = affine::makeComposedFoldedAffineApply( - builder, loc, highPadExpr, - getAsOpFoldResult({encodingPaddingSizes[i], - builder.create(loc, source, i)})); - } - - Value zero = builder.create(loc, elemType, - builder.getZeroAttr(elemType)); - return builder.create(loc, /*resultType=*/nullptr, source, - lowPad, highPad, zero); -} - Value setEncoding(OpBuilder &builder, Location loc, Value source, EncodingAttr encodingAttr) { auto sourceType = cast(source.getType()); @@ -141,35 +107,6 @@ static MatmulNarrowSizes getMatmulNarrowSizes(ShapedType outType, return narrow; } -static Value padAndSetEncoding(OpBuilder &builder, Location loc, Value source, - int64_t operandIndex, - ArrayRef operandElemTypes, - MatmulNarrowSizes narrow, - ArrayRef indexingMaps, - IREE::Encoding::EncodingOpType opType) { - MLIRContext *ctx = builder.getContext(); - // No need to specify original_type in the encoding poadded to pad(), because - // the operand there is the `source` tensor, so it will default to reading its - // original shape. - auto encodingForPad = EncodingAttr::get( - ctx, operandIndex, opType, operandElemTypes, - /*originalType=*/Type{}, narrow.M, narrow.N, indexingMaps); - Value padded = pad(builder, loc, source, encodingForPad); - // For setEncoding() below, we potentially need to specify an encoding with an - // explicit original_type, because the operand there is the padded tensor - // returned by pad() above, but we want setEncoding to be aware of the - // original source tensor shape, not the padded tensor shape. To limit IR - // verbosity, we only specify the original original_type when it differs from - // the tensor type that the encoding is applied to. 
- auto encodingForSetEncoding = encodingForPad; - if (padded.getType() != source.getType()) { - encodingForSetEncoding = EncodingAttr::get( - ctx, operandIndex, opType, operandElemTypes, - /*originalType=*/source.getType(), narrow.M, narrow.N, indexingMaps); - } - return setEncoding(builder, loc, padded, encodingForSetEncoding); -} - static Value unsetEncodingAndExtractSlice(OpBuilder &builder, Location loc, Value source, SmallVector sizes) { @@ -336,31 +273,18 @@ class setContractionOpEncoding Location loc = linalgOp.getLoc(); SmallVector maps = linalgOp.getIndexingMapsArray(); - Value encodedLhs, encodedRhs, encodedOut; auto opType = IREE::Encoding::EncodingOpType::matmul; - if (!padFactor) { - encodedLhs = - padAndSetEncoding(rewriter, loc, lhs, IREE::Encoding::MATMUL_LHS, - elemTypes, narrowSizes, maps, opType); - encodedRhs = - padAndSetEncoding(rewriter, loc, rhs, IREE::Encoding::MATMUL_RHS, - elemTypes, narrowSizes, maps, opType); - encodedOut = - padAndSetEncoding(rewriter, loc, out, IREE::Encoding::MATMUL_RESULT, - elemTypes, narrowSizes, maps, opType); - } else { - auto setEncodingWrapper = [&](Value src, int64_t operandIndex) -> Value { - SmallVector roundDimsTo(linalgOp.getNumLoops(), padFactor); - auto encoding = EncodingAttr::get( - linalgOp.getContext(), operandIndex, opType, elemTypes, - src.getType(), narrowSizes.M, narrowSizes.N, maps, roundDimsTo); - return setEncoding(rewriter, loc, src, encoding); - }; - encodedLhs = setEncodingWrapper(lhs, IREE::Encoding::MATMUL_LHS); - encodedRhs = setEncodingWrapper(rhs, IREE::Encoding::MATMUL_RHS); - encodedOut = setEncodingWrapper(out, IREE::Encoding::MATMUL_RESULT); - } + auto setEncodingWrapper = [&](Value src, int64_t operandIndex) -> Value { + SmallVector roundDimsTo(3, padFactor); + auto encoding = EncodingAttr::get( + linalgOp.getContext(), operandIndex, opType, elemTypes, src.getType(), + narrowSizes.M, narrowSizes.N, maps, roundDimsTo); + return setEncoding(rewriter, loc, src, encoding); + }; + Value encodedLhs = setEncodingWrapper(lhs, IREE::Encoding::MATMUL_LHS); + Value encodedRhs = setEncodingWrapper(rhs, IREE::Encoding::MATMUL_RHS); + Value encodedOut = setEncodingWrapper(out, IREE::Encoding::MATMUL_RESULT); Value opTiled = clone(rewriter, linalgOp, encodedOut.getType(), ValueRange{encodedLhs, encodedRhs, encodedOut}) ->getResult(0); @@ -381,7 +305,7 @@ class setContractionOpEncoding } private: - int64_t padFactor = 0; + int64_t padFactor = 32; }; /// Pattern to fold a `linalg.fill` -> `iree_encoding.set_encoding` diff --git a/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir b/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir index e3e96d76d3f8..c829bf9a6b39 100644 --- a/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir +++ b/compiler/src/iree/compiler/GlobalOptimization/test/set_encoding.mlir @@ -1,5 +1,4 @@ -// RUN: iree-opt --iree-global-opt-set-encoding --cse --split-input-file %s | FileCheck %s -// RUN: iree-opt --iree-global-opt-set-encoding="pad-factor=16" --cse --split-input-file %s | FileCheck %s --check-prefix=PAD-WITHIN-ENCODING +// RUN: iree-opt --iree-global-opt-set-encoding="pad-factor=16" --cse --split-input-file %s | FileCheck %s util.func public @matmul_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<250x500xf32>, %arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> { @@ -7,7 +6,6 @@ util.func public @matmul_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<2 outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> util.return %0 : 
tensor<100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -15,51 +13,17 @@ util.func public @matmul_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<2 // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xf32> -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]]] -// CHECK: tensor<100x250xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]]] -// CHECK: tensor<250x500xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]]] -// CHECK: tensor<100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = 
iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] -// The only difference with `pad-factor` being set is creating pad ops or not. -// Having a single test for now is okay, others are covered in the other path. -// PAD-WITHIN-ENCODING-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// PAD-WITHIN-ENCODING-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// PAD-WITHIN-ENCODING-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// PAD-WITHIN-ENCODING: util.func public @matmul_f32f32f32( -// PAD-WITHIN-ENCODING-SAME: %[[ARG0:[a-zA-Z0-9]+]] -// PAD-WITHIN-ENCODING-SAME: %[[ARG1:[a-zA-Z0-9]+]] -// PAD-WITHIN-ENCODING-SAME: %[[ARG2:[a-zA-Z0-9]+]] -// PAD-WITHIN-ENCODING: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] -// PAD-WITHIN-ENCODING-SAME: tensor<100x250xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> -// PAD-WITHIN-ENCODING: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] -// PAD-WITHIN-ENCODING-SAME: tensor<250x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> -// PAD-WITHIN-ENCODING: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG2]] -// PAD-WITHIN-ENCODING-SAME: tensor<100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // ----- @@ -69,7 +33,6 @@ util.func public @matmul_f32f32f32_dynamic(%arg0 : tensor, %arg1 : tens outs(%arg2 : tensor) -> tensor util.return %0 : tensor } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -77,33 +40,12 @@ util.func public @matmul_f32f32f32_dynamic(%arg0 : tensor, %arg1 : tens // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, %[[ARG2:.+]]: tensor // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index -// CHECK: %[[LHS_DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[LHS_DIM0]]] -// CHECK: %[[LHS_DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[LHS_DIM1]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]]] -// CHECK: tensor to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor> -// CHECK: %[[RHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index -// CHECK: %[[RHS_DIM0:.+]] = tensor.dim %[[ARG1]], %[[C0]] -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[RHS_DIM0]]] -// CHECK: %[[RHS_DIM1:.+]] = tensor.dim %[[ARG1]], %[[C1]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[RHS_DIM1]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]]] -// CHECK: tensor to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index -// CHECK: %[[OUTS_DIM0:.+]] = tensor.dim %[[ARG2]], 
%[[C0]] -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[OUTS_DIM0]]] -// CHECK: %[[OUTS_DIM1:.+]] = tensor.dim %[[ARG2]], %[[C1]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[OUTS_DIM1]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]]] -// CHECK: tensor to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : @@ -126,26 +68,16 @@ util.func public @matmul_i8i8i32(%arg0 : tensor<100x250xi8>, %arg1 : tensor<250x // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xi8> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xi8> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xi32> -// CHECK: %[[LHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xi8, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high -// CHECK: tensor<100x250xi8> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[RHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xi8, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high -// CHECK: tensor<250x500xi8> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[OUTS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xi32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high -// CHECK: tensor<100x500xi32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xi8, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xi8, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xi32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -163,26 +95,16 @@ util.func 
public @matmul_f16f16f32(%arg0 : tensor<100x250xf16>, %arg1 : tensor<2 // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xf32> -// CHECK: %[[LHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high -// CHECK: tensor<100x250xf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[RHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high -// CHECK: tensor<250x500xf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[OUTS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high -// CHECK: tensor<100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -200,26 +122,16 @@ util.func public @matmul_f16f16f16(%arg0 : tensor<100x250xf16>, %arg1 : tensor<2 // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xf16> -// CHECK: %[[LHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high -// CHECK: tensor<100x250xf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[RHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high -// CHECK: tensor<250x500xf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[OUTS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PAD:.+]] = 
tensor.pad %[[ARG2]] low[0, 0] high -// CHECK: tensor<100x500xf16> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -237,26 +149,16 @@ util.func public @matmul_bf16bf16f32(%arg0 : tensor<100x250xbf16>, %arg1 : tenso // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xbf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xbf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xf32> -// CHECK: %[[LHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xbf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high -// CHECK: tensor<100x250xbf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[RHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xbf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high -// CHECK: tensor<250x500xbf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[OUTS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high -// CHECK: tensor<100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: 
%[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -274,26 +176,16 @@ util.func public @matmul_bf16bf16bf16(%arg0 : tensor<100x250xbf16>, %arg1 : tens // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xbf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xbf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xbf16> -// CHECK: %[[LHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xbf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high -// CHECK: tensor<100x250xbf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[RHS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xbf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high -// CHECK: tensor<250x500xbf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: %[[OUTS_TILE_SIZE]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xbf16, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high -// CHECK: tensor<100x500xbf16> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -304,7 +196,6 @@ util.func public @batch_matmul_f32f32f32(%arg0 : tensor<64x100x250xf32>, %arg1 : outs(%arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32> util.return %0 : tensor<64x100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -312,39 +203,16 @@ util.func public @batch_matmul_f32f32f32(%arg0 : tensor<64x100x250xf32>, %arg1 : // CHECK-SAME: %[[ARG0:.+]]: tensor<64x100x250xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<64x250x500xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<64x100x500xf32> -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = 
iree_encoding.upper_bound_tile_size tensor<64x100x250xf32, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x250xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x250x500xf32, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<64x250x500xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x500xf32, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<64x100x250xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<64x250x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<64x100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -355,48 +223,17 @@ util.func public @batch_matmul_f32f32f32_dynamic(%arg0 : tensor, %arg outs(%arg2 : tensor) -> tensor util.return %0 : tensor } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: 
#[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> // CHECK: util.func public @batch_matmul_f32f32f32_dynamic( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, %[[ARG2:.+]]: tensor -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor> -> index, index, index -// CHECK: %[[LHS_DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[LHS_DIM0]]] -// CHECK: %[[LHS_DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[LHS_DIM1]]] -// CHECK: %[[LHS_DIM2:.+]] = tensor.dim %[[ARG0]], %[[C2]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[LHS_DIM2]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor> -> index, index, index -// CHECK: %[[RHS_DIM0:.+]] = tensor.dim %[[ARG1]], %[[C0]] -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[RHS_DIM0]]] -// CHECK: %[[RHS_DIM1:.+]] = tensor.dim %[[ARG1]], %[[C1]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[RHS_DIM1]]] -// CHECK: %[[RHS_DIM2:.+]] = tensor.dim %[[ARG1]], %[[C2]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[RHS_DIM2]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor> -> index, index, index -// CHECK: %[[OUTS_DIM0:.+]] = tensor.dim %[[ARG2]], %[[C0]] -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[OUTS_DIM0]]] -// CHECK: %[[OUTS_DIM1:.+]] = tensor.dim %[[ARG2]], %[[C1]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[OUTS_DIM1]]] -// CHECK: %[[OUTS_DIM2:.+]] = tensor.dim %[[ARG2]], %[[C2]] -// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[OUTS_DIM2]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // 
CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : @@ -412,7 +249,6 @@ util.func public @batch_matmul_f16f16f16(%arg0 : tensor<64x100x250xf16>, %arg1 : outs(%arg2 : tensor<64x100x500xf16>) -> tensor<64x100x500xf16> util.return %0 : tensor<64x100x500xf16> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -420,39 +256,16 @@ util.func public @batch_matmul_f16f16f16(%arg0 : tensor<64x100x250xf16>, %arg1 : // CHECK-SAME: %[[ARG0:.+]]: tensor<64x100x250xf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<64x250x500xf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<64x100x500xf16> -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x250xf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x250xf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x250x500xf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<64x250x500xf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x500xf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x500xf16> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<64x100x250xf16, #iree_encoding.encoding, 
user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<64x250x500xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<64x100x500xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -463,7 +276,6 @@ util.func public @batch_matmul_f16f16f32(%arg0 : tensor<64x100x250xf16>, %arg1 : outs(%arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32> util.return %0 : tensor<64x100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -471,39 +283,16 @@ util.func public @batch_matmul_f16f16f32(%arg0 : tensor<64x100x250xf16>, %arg1 : // CHECK-SAME: %[[ARG0:.+]]: tensor<64x100x250xf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<64x250x500xf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<64x100x500xf32> -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x250xf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x250xf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x250x500xf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<64x250x500xf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x500xf32, 
#iree_encoding.encoding> -> index, index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<64x100x250xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<64x250x500xf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<64x100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -514,7 +303,6 @@ util.func public @batch_matmul_bf16bf16bf16(%arg0 : tensor<64x100x250xbf16>, %ar outs(%arg2 : tensor<64x100x500xbf16>) -> tensor<64x100x500xbf16> util.return %0 : tensor<64x100x500xbf16> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -522,39 +310,16 @@ util.func public @batch_matmul_bf16bf16bf16(%arg0 : tensor<64x100x250xbf16>, %ar // CHECK-SAME: %[[ARG0:.+]]: tensor<64x100x250xbf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<64x250x500xbf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<64x100x500xbf16> -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x250xbf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x250xbf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = 
iree_encoding.upper_bound_tile_size tensor<64x250x500xbf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<64x250x500xbf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x500xbf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x500xbf16> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<64x100x250xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<64x250x500xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<64x100x500xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -565,7 +330,6 @@ util.func public @batch_matmul_bf16bf16f32(%arg0 : tensor<64x100x250xbf16>, %arg outs(%arg2 : tensor<64x100x500xf32>) -> tensor<64x100x500xf32> util.return %0 : tensor<64x100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -573,39 +337,16 @@ util.func public @batch_matmul_bf16bf16f32(%arg0 : tensor<64x100x250xbf16>, %arg // CHECK-SAME: %[[ARG0:.+]]: tensor<64x100x250xbf16> // CHECK-SAME: %[[ARG1:.+]]: tensor<64x250x500xbf16> // CHECK-SAME: %[[ARG2:.+]]: tensor<64x100x500xf32> -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// 
CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x250xbf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x250xbf16> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x250x500xbf16, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<64x250x500xbf16> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x500xf32, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<64x100x250xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<64x250x500xbf16, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<64x100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -616,7 +357,6 @@ util.func public @batch_matmul_i8i8i32(%arg0 : tensor<64x100x250xi8>, %arg1 : te outs(%arg2 : tensor<64x100x500xi32>) -> tensor<64x100x500xi32> util.return %0 : tensor<64x100x500xi32> } -// 
CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -624,39 +364,16 @@ util.func public @batch_matmul_i8i8i32(%arg0 : tensor<64x100x250xi8>, %arg1 : te // CHECK-SAME: %[[ARG0:.+]]: tensor<64x100x250xi8> // CHECK-SAME: %[[ARG1:.+]]: tensor<64x250x500xi8> // CHECK-SAME: %[[ARG2:.+]]: tensor<64x100x500xi32> -// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x250xi8, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x250xi8> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x250x500xi8, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<64x250x500xi8> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<64x100x500xi32, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C64]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor<64x100x500xi32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<64x100x250xi8, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<64x250x500xi8, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// 
CHECK-SAME: tensor<64x100x500xi32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [64, 100, 500] [1, 1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -667,7 +384,6 @@ util.func public @vecmat_f32f32f32(%arg0 : tensor<250xf32>, %arg1 : tensor<250x5 outs(%arg2 : tensor<500xf32>) -> tensor<500xf32> util.return %0 : tensor<500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d1)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d1, d0)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)> @@ -675,32 +391,16 @@ util.func public @vecmat_f32f32f32(%arg0 : tensor<250xf32>, %arg1 : tensor<250x5 // CHECK-SAME: %[[ARG0:.+]]: tensor<250xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<500xf32> -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]] = iree_encoding.upper_bound_tile_size tensor<250xf32, #iree_encoding.encoding> -> index -// CHECK: %[[LHS_PADDING_SIZE:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]], %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0] high[%[[LHS_PADDING_SIZE]]] -// CHECK: tensor<250xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]]] -// CHECK: tensor<250x500xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]] = iree_encoding.upper_bound_tile_size tensor<500xf32, #iree_encoding.encoding> -> index -// CHECK: %[[OUTS_PADDING_SIZE:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]], %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0] high[%[[OUTS_PADDING_SIZE]]] -// CHECK: tensor<500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<250xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = 
iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<500xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[VECMAT:.+]] = linalg.vecmat // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[VECMAT]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0] [500] [1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[VECMAT]] // CHECK: util.return %[[RESULT]] // ----- @@ -711,7 +411,6 @@ util.func public @matvec_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<2 outs(%arg2 : tensor<100xf32>) -> tensor<100xf32> util.return %0 : tensor<100xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1) -> (d1)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0)> @@ -719,32 +418,16 @@ util.func public @matvec_f32f32f32(%arg0 : tensor<100x250xf32>, %arg1 : tensor<2 // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<250xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<100xf32> -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]]] -// CHECK: tensor<100x250xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]] = iree_encoding.upper_bound_tile_size tensor<250xf32, #iree_encoding.encoding> -> index -// CHECK: %[[RHS_PADDING_SIZE:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]], %[[C250]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0] high[%[[RHS_PADDING_SIZE]]] -// CHECK: tensor<250xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]] = iree_encoding.upper_bound_tile_size tensor<100xf32, #iree_encoding.encoding> -> index -// CHECK: %[[OUTS_PADDING_SIZE:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]], %[[C100]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0] high[%[[OUTS_PADDING_SIZE]]] -// CHECK: tensor<100xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: 
%[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[MATVEC:.+]] = linalg.matvec // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATVEC]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0] [100] [1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATVEC]] // CHECK: util.return %[[RESULT]] // ----- @@ -755,7 +438,6 @@ util.func public @batch_vecmat_f32f32f32(%arg0 : tensor<3x250xf32>, %arg1 : tens outs(%arg2 : tensor<3x500xf32>) -> tensor<3x500xf32> util.return %0 : tensor<3x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d2, d1)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -763,36 +445,16 @@ util.func public @batch_vecmat_f32f32f32(%arg0 : tensor<3x250xf32>, %arg1 : tens // CHECK-SAME: %[[ARG0:.+]]: tensor<3x250xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<3x250x500xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<3x500xf32> -// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<3x250xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C3]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]]] -// CHECK: tensor<3x250xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<3x250x500xf32, #iree_encoding.encoding> -> index, index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C3]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<3x250x500xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<3x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C3]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]]] -// CHECK: tensor<3x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, matmul_narrow_M = 1 : index, user_indexing_maps = 
[#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<3x250xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<3x250x500xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<3x500xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[VECMAT:.+]] = linalg.batch_vecmat // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[VECMAT]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [3, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[VECMAT]] // CHECK: util.return %[[RESULT]] // ----- @@ -803,44 +465,17 @@ util.func public @batch_matvec_f32f32f32_dynamic(%arg0 : tensor, %arg outs(%arg2 : tensor) -> tensor util.return %0 : tensor } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK: util.func public @batch_matvec_f32f32f32_dynamic( // CHECK-SAME: %[[ARG0:.+]]: tensor, %[[ARG1:.+]]: tensor, %[[ARG2:.+]]: tensor -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor> -> index, index, index -// CHECK: %[[LHS_DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[LHS_DIM0]]] -// CHECK: %[[LHS_DIM1:.+]] = tensor.dim %[[ARG0]], %[[C1]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[LHS_DIM1]]] -// CHECK: %[[LHS_DIM2:.+]] = tensor.dim %[[ARG0]], %[[C2]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[LHS_DIM2]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor> -// CHECK: %[[RHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index -// CHECK: %[[RHS_DIM0:.+]] = tensor.dim %[[ARG1]], %[[C0]] -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[RHS_DIM0]]] -// CHECK: %[[RHS_DIM1:.+]] = tensor.dim %[[ARG1]], %[[C1]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[RHS_DIM1]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]]] -// CHECK: tensor to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor> -> index, index -// CHECK: %[[OUTS_DIM0:.+]] = tensor.dim %[[ARG2]], %[[C0]] -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = 
affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[OUTS_DIM0]]] -// CHECK: %[[OUTS_DIM1:.+]] = tensor.dim %[[ARG2]], %[[C1]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[OUTS_DIM1]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]]] -// CHECK: tensor to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATVEC:.+]] = linalg.batch_matvec // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : @@ -851,24 +486,24 @@ util.func public @batch_matvec_f32f32f32_dynamic(%arg0 : tensor, %arg // ----- util.func public @fold_fill_with_set_encoding(%arg0 : index, %arg1 : index) - -> tensor> { + -> tensor>> { %cst = arith.constant 0.0 : f32 %0 = tensor.empty(%arg0, %arg1) : tensor %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor %2 = iree_encoding.set_encoding %1 : tensor - -> tensor> - util.return %2 : tensor> + -> tensor>> + util.return %2 : tensor>> } // CHECK: util.func public @fold_fill_with_set_encoding( -// CHECK: %[[EMPTY:.+]] = tensor.empty(%{{.+}}, %{{.+}}) : tensor> +// CHECK: %[[EMPTY:.+]] = tensor.empty(%{{.+}}, %{{.+}}) : tensor>> // CHECK: %[[FILL:.+]] = linalg.fill -// CHECK-SAME: outs(%[[EMPTY]] : tensor>) +// CHECK-SAME: outs(%[[EMPTY]] : tensor>>) // CHECK: util.return %[[FILL]] // ----- util.func public @fold_fill_with_tensor_pad(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) - -> tensor> { + -> tensor>> { %cst = arith.constant 0.0 : f32 %0 = tensor.empty(%arg0, %arg1) : tensor %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor @@ -877,12 +512,12 @@ util.func public @fold_fill_with_tensor_pad(%arg0 : index, %arg1 : index, %arg2 tensor.yield %cst : f32 } : tensor to tensor %3 = iree_encoding.set_encoding %2 : tensor - -> tensor> - util.return %3 : tensor> + -> tensor>> + util.return %3 : tensor>> } // CHECK: util.func public @fold_fill_with_tensor_pad( // CHECK: %[[EMPTY:.+]] = tensor.empty( -// CHECK-SAME: tensor> +// CHECK-SAME: tensor>> // CHECK: %[[FILL:.+]] = linalg.fill // CHECK-SAME: outs(%[[EMPTY]] : // CHECK: util.return %[[FILL]] @@ -960,9 +595,9 @@ util.func public @matmul_casted_from_i1_f32f32f32(%arg0 : tensor<64x256xi1>, // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK: util.func public @matmul_casted_from_i1_f32f32f32 -// CHECK: set_encoding {{.+}} tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: set_encoding {{.+}} tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: set_encoding {{.+}} tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> +// CHECK: set_encoding {{.+}} tensor<64x256xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: set_encoding {{.+}} 
tensor<256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: set_encoding {{.+}} tensor<64x128xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // ----- @@ -988,9 +623,9 @@ util.func public @matmul_generic_casted_from_i1_f32f32f32(%arg0 : tensor<64x256x // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK: util.func public @matmul_generic_casted_from_i1_f32f32f32 -// CHECK: set_encoding {{.+}} tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: set_encoding {{.+}} tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> -// CHECK: set_encoding {{.+}} tensor, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]]>> +// CHECK: set_encoding {{.+}} tensor<64x256xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: set_encoding {{.+}} tensor<256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: set_encoding {{.+}} tensor<64x128xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // ----- @@ -1004,9 +639,12 @@ util.func public @matmul_f32f32f32_narrow_M(%arg0 : tensor<2x250xf32>, %arg1 : t // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK: util.func public @matmul_f32f32f32_narrow_M( -// CHECK: iree_encoding.upper_bound_tile_size tensor<2x250xf32, #iree_encoding.encoding> -// CHECK: iree_encoding.upper_bound_tile_size tensor<250x500xf32, #iree_encoding.encoding> -// CHECK: iree_encoding.upper_bound_tile_size tensor<2x500xf32, #iree_encoding.encoding> +// CHECK: iree_encoding.set_encoding +// CHECK-SAME: tensor<2x250xf32, #iree_encoding.encoding, matmul_narrow_M = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: iree_encoding.set_encoding +// CHECK-SAME: tensor<250x500xf32, #iree_encoding.encoding, matmul_narrow_M = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: iree_encoding.set_encoding +// CHECK-SAME: tensor<2x500xf32, #iree_encoding.encoding, matmul_narrow_M = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // CHECK: linalg.matmul // ----- @@ -1021,9 +659,12 @@ util.func public @batch_matmul_f32f32f32_narrow_MN(%arg0 : tensor<64x4x250xf32>, // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> // CHECK: util.func public @batch_matmul_f32f32f32_narrow_MN( -// CHECK: iree_encoding.upper_bound_tile_size tensor<64x4x250xf32, #iree_encoding.encoding> -// CHECK: iree_encoding.upper_bound_tile_size tensor<64x250x2xf32, #iree_encoding.encoding> -// CHECK: iree_encoding.upper_bound_tile_size tensor<64x4x2xf32, #iree_encoding.encoding> +// CHECK: iree_encoding.set_encoding +// CHECK-SAME: tensor<64x4x250xf32, #iree_encoding.encoding, matmul_narrow_N = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> +// CHECK: iree_encoding.set_encoding +// CHECK-SAME: tensor<64x250x2xf32, #iree_encoding.encoding, matmul_narrow_N = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], 
round_dims_to = array>> +// CHECK: iree_encoding.set_encoding +// CHECK-SAME: tensor<64x4x2xf32, #iree_encoding.encoding, matmul_narrow_N = 2 : index, user_indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], round_dims_to = array>> // CHECK: linalg.batch_matmul // ----- @@ -1035,7 +676,6 @@ util.func public @matmul_transpose_a_f32f32f32(%arg0 : tensor<250x100xf32>, %arg util.return %0 : tensor<100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1043,35 +683,16 @@ util.func public @matmul_transpose_a_f32f32f32(%arg0 : tensor<250x100xf32>, %arg // CHECK-SAME: %[[ARG0:.+]]: tensor<250x100xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<250x500xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xf32> -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<250x100xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C250]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]]] -// CHECK: tensor<250x100xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<250x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]]] -// CHECK: tensor<250x500xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]]] -// CHECK: tensor<100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<250x100xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<250x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xf32, #iree_encoding.encoding, user_indexing_maps = 
[#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul_transpose_a // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -1082,7 +703,6 @@ util.func public @matmul_transpose_b_f32f32f32(%arg0 : tensor<100x250xf32>, %arg outs(%arg2 : tensor<100x500xf32>) -> tensor<100x500xf32> util.return %0 : tensor<100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -1090,35 +710,16 @@ util.func public @matmul_transpose_b_f32f32f32(%arg0 : tensor<100x250xf32>, %arg // CHECK-SAME: %[[ARG0:.+]]: tensor<100x250xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<500x250xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<100x500xf32> -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<100x250xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]]] -// CHECK: tensor<100x250xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<500x250xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C500]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]]] -// CHECK: tensor<500x250xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<100x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C100]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]]] -// CHECK: tensor<100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<100x250xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<500x250xf32, 
#iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[MATMUL:.+]] = linalg.matmul_transpose_b // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [100, 500] [1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -1129,7 +730,6 @@ util.func public @batch_matmul_transpose_a_f32f32f32(%arg0 : tensor<2x250x100xf3 outs(%arg2 : tensor<2x100x500xf32>) -> tensor<2x100x500xf32> util.return %0 : tensor<2x100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d1)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -1137,39 +737,16 @@ util.func public @batch_matmul_transpose_a_f32f32f32(%arg0 : tensor<2x250x100xf3 // CHECK-SAME: %[[ARG0:.+]]: tensor<2x250x100xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<2x250x500xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<2x100x500xf32> -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x250x100xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C2]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C100]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<2x250x100xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x250x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C2]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C250]]] -// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]] -// CHECK: tensor<2x250x500xf32> to tensor -// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x100x500xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C2]]] -// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: 
%[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]] -// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]] -// CHECK: tensor<2x100x500xf32> to tensor -// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> +// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]] +// CHECK-SAME: tensor<2x250x100xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]] +// CHECK-SAME: tensor<2x250x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> +// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]] +// CHECK-SAME: tensor<2x100x500xf32, #iree_encoding.encoding, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array>> // CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul_transpose_a // CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : -// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] -// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [2, 100, 500] [1, 1, 1] +// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]] // CHECK: util.return %[[RESULT]] // ----- @@ -1180,7 +757,6 @@ util.func public @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<2x100x250xf3 outs(%arg2 : tensor<2x100x500xf32>) -> tensor<2x100x500xf32> util.return %0 : tensor<2x100x500xf32> } -// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> // CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> // CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> @@ -1188,39 +764,16 @@ util.func public @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<2x100x250xf3 // CHECK-SAME: %[[ARG0:.+]]: tensor<2x100x250xf32> // CHECK-SAME: %[[ARG1:.+]]: tensor<2x500x250xf32> // CHECK-SAME: %[[ARG2:.+]]: tensor<2x100x500xf32> -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index -// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index -// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index -// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x100x250xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C2]]] -// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]] -// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]] -// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]] -// CHECK: tensor<2x100x250xf32> to tensor -// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]] -// CHECK-SAME: tensor, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x500x250xf32, #iree_encoding.encoding> -> index, index -// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C2]]] -// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C500]]] -// CHECK: 

@@ -1180,7 +757,6 @@ util.func public @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<2x100x250xf3
outs(%arg2 : tensor<2x100x500xf32>) -> tensor<2x100x500xf32>
util.return %0 : tensor<2x100x500xf32>
}
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
@@ -1188,39 +764,16 @@ util.func public @batch_matmul_transpose_b_f32f32f32(%arg0 : tensor<2x100x250xf3
// CHECK-SAME: %[[ARG0:.+]]: tensor<2x100x250xf32>
// CHECK-SAME: %[[ARG1:.+]]: tensor<2x500x250xf32>
// CHECK-SAME: %[[ARG2:.+]]: tensor<2x100x500xf32>
-// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
-// CHECK-DAG: %[[C100:.+]] = arith.constant 100 : index
-// CHECK-DAG: %[[C250:.+]] = arith.constant 250 : index
-// CHECK-DAG: %[[C500:.+]] = arith.constant 500 : index
-// CHECK: %[[LHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x100x250xf32, #iree_encoding.encoding<role = LHS, element_types = [f32, f32, f32], user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -> index, index, index
-// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C2]]]
-// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C100]]]
-// CHECK: %[[LHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#2, %[[C250]]]
-// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]], %[[LHS_PADDING_SIZE2]]]
-// CHECK: tensor<2x100x250xf32> to tensor<?x?x?xf32>
-// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]]
-// CHECK-SAME: tensor<?x?x?xf32, #iree_encoding.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<2x100x250xf32>, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>>
-// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x500x250xf32, #iree_encoding.encoding<role = RHS, element_types = [f32, f32, f32], user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -> index, index, index
-// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C2]]]
-// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C500]]]
-// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C250]]]
-// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]]
-// CHECK: tensor<2x500x250xf32> to tensor<?x?x?xf32>
-// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]]
-// CHECK-SAME: tensor<?x?x?xf32, #iree_encoding.encoding<role = RHS, element_types = [f32, f32, f32], original_type = tensor<2x500x250xf32>, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>>
-// CHECK: %[[OUTS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<2x100x500xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -> index, index, index
-// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C2]]]
-// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C100]]]
-// CHECK: %[[OUTS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#2, %[[C500]]]
-// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]], %[[OUTS_PADDING_SIZE2]]]
-// CHECK: tensor<2x100x500xf32> to tensor<?x?x?xf32>
-// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]]
-// CHECK-SAME: tensor<?x?x?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<2x100x500xf32>, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>>
+// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]]
+// CHECK-SAME: tensor<2x100x250xf32, #iree_encoding.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<2x100x250xf32>, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array<i64: 16, 16, 16>>>
+// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]]
+// CHECK-SAME: tensor<2x500x250xf32, #iree_encoding.encoding<role = RHS, element_types = [f32, f32, f32], original_type = tensor<2x500x250xf32>, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array<i64: 16, 16, 16>>>
+// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]]
+// CHECK-SAME: tensor<2x100x500xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<2x100x500xf32>, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array<i64: 16, 16, 16>>>
// CHECK: %[[BATCH_MATMUL:.+]] = linalg.batch_matmul_transpose_b
// CHECK-SAME: ins(%[[LHS]], %[[RHS]] :
// CHECK-SAME: outs(%[[OUTS]] :
-// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]]
-// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0, 0] [2, 100, 500] [1, 1, 1]
+// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[BATCH_MATMUL]]
// CHECK: util.return %[[RESULT]]

// -----
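With the rounding carried as round_dims_to on the encoding itself, padding is deferred to layout materialization instead of being spelled out with upper_bound_tile_size/tensor.pad in the IR, and unset_encoding now returns the logical shape directly, which is why the trailing tensor.extract_slice disappears. Illustrative shape arithmetic for the 2x100x500xf32 result above, assuming the array<i64: 16, 16, 16> rounding shown in these tests:

// Each rounded dim goes up to the next multiple of 16 at materialization:
//   M: 100 -> (100 ceildiv 16) * 16 = 112
//   N: 500 -> (500 ceildiv 16) * 16 = 512
//   K: 250 -> (250 ceildiv 16) * 16 = 256
// A backend may therefore pack into a 2x112x512 buffer, while the op-level
// types stay tensor<2x100x500xf32> on both sides of the encoding.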

@@ -1238,7 +791,6 @@ util.func public @generic_batch_vecmat_transposed_i16u4i32(%arg0 : tensor<32x128
util.return %0 : tensor<4096x32xi32>
}
-// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
@@ -1246,38 +798,18 @@ util.func public @generic_batch_vecmat_transposed_i16u4i32(%arg0 : tensor<32x128
// CHECK-SAME: %[[ARG0:.+]]: tensor<32x128xi16>
// CHECK-SAME: %[[ARG1:.+]]: tensor<4096x32x128xi4>
// CHECK-SAME: %[[ARG2:.+]]: tensor<4096x32xi32>
-// CHECK-DAG: %[[C4096:.+]] = arith.constant 4096 : index
-// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
-// CHECK-DAG: %[[C32:.+]] = arith.constant 32 : index
-// CHECK: %[[LHS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<32x128xi16, #iree_encoding.encoding<role = LHS, element_types = [i16, ui4, i32], matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -> index, index
-// CHECK: %[[LHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#0, %[[C32]]]
-// CHECK: %[[LHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[LHS_TILE_SIZE]]#1, %[[C128]]]
-// CHECK: %[[LHS_PAD:.+]] = tensor.pad %[[ARG0]] low[0, 0] high[%[[LHS_PADDING_SIZE0]], %[[LHS_PADDING_SIZE1]]]
-// CHECK: tensor<32x128xi16> to tensor<?x?xi16>
-// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[LHS_PAD]]
-// CHECK-SAME: tensor<?x?xi16, #iree_encoding.encoding<role = LHS, element_types = [i16, ui4, i32], original_type = tensor<32x128xi16>, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>>
-// CHECK: %[[RHS_TILE_SIZE:.+]]:3 = iree_encoding.upper_bound_tile_size tensor<4096x32x128xi4, #iree_encoding.encoding<role = RHS, element_types = [i16, ui4, i32], matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -> index, index, index
-// CHECK: %[[RHS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#0, %[[C4096]]]
-// CHECK: %[[RHS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#1, %[[C32]]]
-// CHECK: %[[RHS_PADDING_SIZE2:.+]] = affine.apply #[[MAP]]()[%[[RHS_TILE_SIZE]]#2, %[[C128]]]
-// CHECK: %[[RHS_PAD:.+]] = tensor.pad %[[ARG1]] low[0, 0, 0] high[%[[RHS_PADDING_SIZE0]], %[[RHS_PADDING_SIZE1]], %[[RHS_PADDING_SIZE2]]]
-// CHECK: tensor<4096x32x128xi4> to tensor<?x?x?xi4>
-// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[RHS_PAD]]
-// CHECK-SAME: tensor<?x?x?xi4, #iree_encoding.encoding<role = RHS, element_types = [i16, ui4, i32], original_type = tensor<4096x32x128xi4>, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>>
-// CHECK: %[[OUTS_TILE_SIZE:.+]]:2 = iree_encoding.upper_bound_tile_size tensor<4096x32xi32, #iree_encoding.encoding<role = RESULT, element_types = [i16, ui4, i32], matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>> -> index, index
-// CHECK: %[[OUTS_PADDING_SIZE0:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#0, %[[C4096]]]
-// CHECK: %[[OUTS_PADDING_SIZE1:.+]] = affine.apply #[[MAP]]()[%[[OUTS_TILE_SIZE]]#1, %[[C32]]]
-// CHECK: %[[OUTS_PAD:.+]] = tensor.pad %[[ARG2]] low[0, 0] high[%[[OUTS_PADDING_SIZE0]], %[[OUTS_PADDING_SIZE1]]]
-// CHECK: tensor<4096x32xi32> to tensor<?x?xi32>
-// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[OUTS_PAD]]
-// CHECK-SAME: tensor<?x?xi32> -> tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i16, ui4, i32], original_type = tensor<4096x32xi32>, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]>>
+// CHECK: %[[LHS:.+]] = iree_encoding.set_encoding %[[ARG0]]
+// CHECK-SAME: tensor<32x128xi16, #iree_encoding.encoding<role = LHS, element_types = [i16, ui4, i32], original_type = tensor<32x128xi16>, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array<i64: 16, 16, 16>>>
+// CHECK: %[[RHS:.+]] = iree_encoding.set_encoding %[[ARG1]]
+// CHECK-SAME: tensor<4096x32x128xi4, #iree_encoding.encoding<role = RHS, element_types = [i16, ui4, i32], original_type = tensor<4096x32x128xi4>, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array<i64: 16, 16, 16>>>
+// CHECK: %[[OUTS:.+]] = iree_encoding.set_encoding %[[ARG2]]
+// CHECK-SAME: tensor<4096x32xi32> -> tensor<4096x32xi32, #iree_encoding.encoding<role = RESULT, element_types = [i16, ui4, i32], original_type = tensor<4096x32xi32>, matmul_narrow_M = 1 : index, user_indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]], round_dims_to = array<i64: 16, 16, 16>>>
// CHECK: %[[GENERIC:.+]] = linalg.generic
// CHECK-SAME: indexing_maps = [#[[MAP1]], #[[MAP2]], #[[MAP3]]]
// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
// CHECK-SAME: ins(%[[LHS]], %[[RHS]] :
// CHECK-SAME: outs(%[[OUTS]] :
-// CHECK: %[[RESULT_PADDED:.+]] = iree_encoding.unset_encoding %[[GENERIC]]
-// CHECK: %[[RESULT:.+]] = tensor.extract_slice %[[RESULT_PADDED]][0, 0] [4096, 32] [1, 1]
+// CHECK: %[[RESULT:.+]] = iree_encoding.unset_encoding %[[GENERIC]]
// CHECK: util.return %[[RESULT]]

// -----
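The narrow case above also keeps matmul_narrow_M = 1 : index on the encoding, so materialization can choose a skinny (M = 1) tile for the vecmat rather than padding the unit dimension out. A quick alignment check, again assuming the array<i64: 16, 16, 16> rounding (illustrative only):

// 4096 = 256 * 16, 32 = 2 * 16, 128 = 8 * 16: every dim of the
// i16 x ui4 -> i32 vecmat is already a multiple of 16, so the rounding
// requested by round_dims_to adds no padding for this shape, and the old
// tensor.pad / tensor.extract_slice pair materialized to a round-trip no-op.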