[Codegen][GPU] Add pass to resolve scf.forall ops (#18394)
This pass differs from the upstream pattern in that it supports forall
loops with dynamic trip counts as well as overprovisioning; however, it
has a few restrictions with respect to indexing. It only supports
linearized thread indices, and only loops whose iteration order matches
the order of the fastest-changing thread index (i.e. the outermost loop
is assigned the slowest-changing thread indices). These are practical
choices that keep the implementation of this pass simple by not
requiring `scf.forall` operations to be normalized beforehand. Such a
normalization pass is TODO.
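
As a rough sketch of what this resolution looks like (illustrative only, not
taken from this commit's tests; it assumes a workgroup size of [128, 1, 1] so
the linearized thread id folds to `gpu.thread_id x`, and the single-dimension
delinearization is omitted for clarity):

```mlir
// Input: a normalized forall mapped to linearized thread ids.
scf.forall (%i) in (1024) {
  // ... body using %i ...
} {mapping = [#gpu.thread<linear_dim_0>]}

// Output (roughly, after folding): 1024 divides evenly by the 128 workers,
// so the loop keeps static bounds and the thread id is added back in the body.
%tid = gpu.thread_id x
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%c1024 = arith.constant 1024 : index
scf.for %iv = %c0 to %c1024 step %c128 {
  %i = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%iv, %tid)
  // ... body using %i ...
}
```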
qedawkins committed Sep 11, 2024
1 parent 0dd358d commit 5a6521c
Showing 14 changed files with 409 additions and 37 deletions.
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -55,6 +55,7 @@ iree_compiler_cc_library(
"GPUCombineValueBarriers.cpp",
"GPUCreateFastSlowPath.cpp",
"GPUDistribute.cpp",
"GPUDistributeForall.cpp",
"GPUDistributeScfFor.cpp",
"GPUDistributeSharedMemoryCopy.cpp",
"GPUDistributionPatterns.cpp",
@@ -53,6 +53,7 @@ iree_cc_library(
"GPUCombineValueBarriers.cpp"
"GPUCreateFastSlowPath.cpp"
"GPUDistribute.cpp"
"GPUDistributeForall.cpp"
"GPUDistributeScfFor.cpp"
"GPUDistributeSharedMemoryCopy.cpp"
"GPUDistributionPatterns.cpp"
217 changes: 217 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistributeForall.cpp
@@ -0,0 +1,217 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <functional>
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"

namespace mlir::iree_compiler {

#define GEN_PASS_DEF_GPUDISTRIBUTEFORALLPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

namespace {

struct GPUDistributeForallPass final
    : impl::GPUDistributeForallPassBase<GPUDistributeForallPass> {
  void runOnOperation() override;
};
} // namespace

LogicalResult resolveGPUMappedForallOp(RewriterBase &rewriter,
                                       scf::ForallOp forallOp,
                                       Value linearThreadId,
                                       int64_t flatWorkgroupSize,
                                       int64_t subgroupSize) {

  // Skip forall ops without mappings.
  if (!forallOp.getMapping()) {
    return success();
  }

  ArrayAttr mapping = forallOp.getMappingAttr();
  bool hasThreadMapping =
      llvm::all_of(mapping, llvm::IsaPred<gpu::GPUThreadMappingAttr>);
  bool hasWarpMapping =
      llvm::all_of(mapping, llvm::IsaPred<gpu::GPUWarpMappingAttr>);

  // Skip forall ops that are not mapped to GPU ids.
  if (!hasThreadMapping && !hasWarpMapping) {
    return success();
  }

  if (forallOp->getNumResults() != 0) {
    forallOp.emitOpError("Cannot distribute scf.forall op on tensors.");
    return failure();
  }

  if (!isDescendingRelativeMappingIndices(mapping.getValue())) {
    forallOp.emitOpError("Cannot distribute forall op with non-descending "
                         "relative iterator mapping");
    return failure();
  }

  if (!llvm::all_of(mapping, [](Attribute attr) {
        return cast<DeviceMappingAttrInterface>(attr).isLinearMapping();
      })) {
    forallOp.emitOpError("unimplemented: resolution of scf.forall ops without "
                         "linear id mappings.");
    return failure();
  }

  if (!forallOp.isNormalized()) {
    forallOp.emitOpError("scf.forall op must be normalized for distribution.");
    return failure();
  }

  MLIRContext *context = rewriter.getContext();
  Location loc = forallOp.getLoc();
  AffineExpr d0, d1;
  bindDims(context, d0, d1);

  // Divide the thread ID by the subgroup size if this loop is mapped to
  // subgroups.
  assert(!(hasThreadMapping && hasWarpMapping));
  Value flatId = linearThreadId;
  if (hasWarpMapping) {
    OpFoldResult subgroupSizeVal = rewriter.getIndexAttr(subgroupSize);
    flatId = affine::makeComposedAffineApply(rewriter, loc, d0.floorDiv(d1),
                                             {flatId, subgroupSizeVal});
  }

  SmallVector<Value> delinSizes;
  OpFoldResult totalLoopTripCount = rewriter.getIndexAttr(1);
  for (auto workerCount : forallOp.getMixedUpperBound()) {
    delinSizes.push_back(
        getValueOrCreateConstantIndexOp(rewriter, loc, workerCount));
    totalLoopTripCount = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0 * d1, {totalLoopTripCount, workerCount});
  }

  int64_t flatTotalNumWorkers =
      hasWarpMapping ? flatWorkgroupSize / subgroupSize : flatWorkgroupSize;
  std::optional<int64_t> staticProducerCount =
      getConstantIntValue(totalLoopTripCount);
  bool perfectlyDivides =
      staticProducerCount &&
      staticProducerCount.value() % flatTotalNumWorkers == 0;

  // Step 3. Create the `scf.for` loop for the loop.
  // If the workgroup count perfectly divides the loop's worker count, then we
  // can use a lower bound of 0 and keep the loop bounds static. This helps
  // simplify later loop folding patterns without an `affine.linearize_index` op
  // to help with inferring int ranges.
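  // For example (hypothetical sizes): with 128 flat workers, a static trip
  // count of 1024 divides evenly, so the loop gets static bounds
  // (lb = 0, ub = 1024, step = 128) and the flat id is added to the induction
  // variable below; a dynamic or non-dividing trip count instead uses the
  // flat id as the lower bound and the induction variable directly.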
  Value lb = perfectlyDivides ? rewriter.create<arith::ConstantIndexOp>(loc, 0)
                              : flatId;
  Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, totalLoopTripCount);
  Value step =
      rewriter.create<arith::ConstantIndexOp>(loc, flatTotalNumWorkers);
  auto forLoop = rewriter.create<scf::ForOp>(loc, lb, ub, step, ValueRange{});
  Block *loopBody = forLoop.getBody();

  // Get the replacement IDs for the forall iterator ids.
  rewriter.setInsertionPointToStart(loopBody);
  Value newFlatProducerId =
      perfectlyDivides
          ? affine::makeComposedAffineApply(rewriter, loc, d0 + d1,
                                            {forLoop.getInductionVar(), flatId})
          : forLoop.getInductionVar();

  // We require a descending relative mapping, so delinearize in reverse order.
  auto delinearize = rewriter.create<affine::AffineDelinearizeIndexOp>(
      loc, newFlatProducerId, llvm::to_vector(llvm::reverse(delinSizes)));

  SmallVector<Value> newBlockArgs =
      llvm::map_to_vector(llvm::reverse(delinearize.getResults()),
                          [](OpResult r) -> Value { return r; });

  // Step 4. Inline the region of the forall op.
  Operation *forallTerminator = forallOp.getBody()->getTerminator();
  rewriter.inlineBlockBefore(forallOp.getBody(), loopBody->getTerminator(),
                             newBlockArgs);
  rewriter.eraseOp(forallTerminator);
  rewriter.eraseOp(forallOp);
  return success();
}

void GPUDistributeForallPass::runOnOperation() {
  auto funcOp = getOperation();

  // First map all lane level forall loops to lanes.
  IRRewriter rewriter(funcOp->getContext());
  IREE::GPU::mapLaneForalls(rewriter, funcOp, /*insertBarrier=*/false);

  SmallVector<scf::ForallOp> forallOps;
  funcOp.walk([&](scf::ForallOp op) { forallOps.push_back(op); });
  // Early exit if no more forall ops to distribute.
  if (forallOps.empty()) {
    return;
  }

  std::optional<SmallVector<int64_t>> maybeWorkgroupSize =
      getWorkgroupSize(funcOp);
  if (!maybeWorkgroupSize) {
    funcOp.emitOpError(
        "unimplemented: Distribution with dynamic workgroup size.");
    return signalPassFailure();
  }
  SmallVector<int64_t> workgroupSize = maybeWorkgroupSize.value();

  std::optional<int64_t> maybeSubgroupSize = getSubgroupSize(funcOp);
  if (!maybeSubgroupSize) {
    funcOp.emitOpError(
        "unimplemented: Distribution with dynamic subgroup size.");
    return signalPassFailure();
  }

  int64_t flatWorkgroupSize =
      std::accumulate(workgroupSize.begin(), workgroupSize.end(), 1,
                      std::multiplies<int64_t>());
  int64_t subgroupSize = *maybeSubgroupSize;

  if (flatWorkgroupSize % subgroupSize != 0) {
    funcOp.emitOpError(
        "Invalid workgroup size is not divisible by subgroup size.");
    return signalPassFailure();
  }

  AffineExpr x, y, z;
  bindSymbols(funcOp.getContext(), x, y, z);
  // Compute the linearized thread id.
  AffineExpr linearId =
      x + workgroupSize[0] * y + workgroupSize[1] * workgroupSize[0] * z;
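  // For example, with a hypothetical workgroupSize of [64, 2, 2] this is
  // linearId = x + 64 * y + 128 * z, with x the fastest-changing dimension.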

  rewriter.setInsertionPointToStart(&funcOp.getFunctionBody().front());
  SmallVector<OpFoldResult> threadGrid = {
      rewriter.createOrFold<gpu::ThreadIdOp>(funcOp.getLoc(),
                                             gpu::Dimension::x),
      rewriter.createOrFold<gpu::ThreadIdOp>(funcOp.getLoc(),
                                             gpu::Dimension::y),
      rewriter.createOrFold<gpu::ThreadIdOp>(funcOp.getLoc(),
                                             gpu::Dimension::z)};

  Value linearThreadIdVal = affine::makeComposedAffineApply(
      rewriter, funcOp.getLoc(), linearId, threadGrid);
  for (auto forall : forallOps) {
    rewriter.setInsertionPoint(forall);
    if (failed(resolveGPUMappedForallOp(rewriter, forall, linearThreadIdVal,
                                        flatWorkgroupSize, subgroupSize))) {
      return signalPassFailure();
    }
  }
}

} // namespace mlir::iree_compiler
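
The main difference from the upstream distribution pattern is the dynamic
trip count path above. A minimal sketch of that case (again illustrative, not
from this commit's tests; it assumes a [128, 1, 1] workgroup and a thread
mapping, with the trivial one-dimensional delinearization omitted):

```mlir
// Input: a thread-mapped forall with a dynamic trip count %n.
scf.forall (%i) in (%n) {
  // ... body using %i ...
} {mapping = [#gpu.thread<linear_dim_0>]}

// Output (roughly): the trip count is not known to divide the 128 workers,
// so each worker starts at its own flat id and strides by the worker count.
%tid = gpu.thread_id x
%c128 = arith.constant 128 : index
scf.for %i = %tid to %n step %c128 {
  // ... body using %i ...
}
```

For warp-mapped foralls the structure is the same, except the linearized id is
first divided by the subgroup size and the stride becomes the number of
subgroups per workgroup.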
13 changes: 12 additions & 1 deletion compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -31,9 +31,20 @@ def GPUCreateFastSlowPathPass :
  let dependentDialects = ["::mlir::scf::SCFDialect"];
}

def GPUDistributeForallPass :
    InterfacePass<"iree-codegen-gpu-distribute-forall", "mlir::FunctionOpInterface"> {
  let summary = "Pass to distribute scf.forall ops.";
  let dependentDialects = [
    "::mlir::affine::AffineDialect",
    "::mlir::gpu::GPUDialect",
    "::mlir::scf::SCFDialect",
    "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect",
  ];
}

def GPUDistributePass :
    InterfacePass<"iree-codegen-gpu-distribute", "mlir::FunctionOpInterface"> {
  let summary = "Pass to distribute scf.forall ops.";
  let summary = "Pass to distribute scf.forall ops using upstream patterns.";
  let dependentDialects = [
    "::mlir::affine::AffineDialect",
    "::mlir::gpu::GPUDialect",
@@ -22,6 +22,7 @@ iree_lit_test_suite(
"gpu_check_resource_usage.mlir",
"gpu_create_fast_slow_path.mlir",
"gpu_distribute.mlir",
"gpu_distribute_forall.mlir",
"gpu_distribute_scf_for.mlir",
"gpu_distribute_shared_memory.mlir",
"gpu_generalize_named_ops.mlir",
@@ -19,6 +19,7 @@ iree_lit_test_suite(
"gpu_combine_value_barriers.mlir"
"gpu_create_fast_slow_path.mlir"
"gpu_distribute.mlir"
"gpu_distribute_forall.mlir"
"gpu_distribute_scf_for.mlir"
"gpu_distribute_shared_memory.mlir"
"gpu_generalize_named_ops.mlir"