From 858c7b9c04123c96fffec247d63e2ee68312ab23 Mon Sep 17 00:00:00 2001
From: hanhanW <hanhan0912@gmail.com>
Date: Mon, 12 Aug 2024 11:34:47 -0700
Subject: [PATCH 1/2] Move the createExtractAddressComputationGPUPass declaration to where it is defined (i.e., LLVMGPU/).

Signed-off-by: hanhanW <hanhan0912@gmail.com>
---
 compiler/src/iree/compiler/Codegen/Common/Passes.h  | 5 -----
 compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h | 4 ++++
 2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.h b/compiler/src/iree/compiler/Codegen/Common/Passes.h
index b6db41b616f1..e9ca9d81f1be 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.h
@@ -88,11 +88,6 @@ createTileAndDistributeToWorkgroupsPass(
     int32_t maxWorkgroupParallelDims,
     linalg::DistributionMethod distributionMethod);
 
-// TODO(hanchung): Move it where it is defined (i.e., Codegen/LLVMGPU).
-// Extract address computations (including the ones with GPU instructions) into
-// their own separate instructions.
-std::unique_ptr<Pass> createExtractAddressComputationGPUPass();
-
 //----------------------------------------------------------------------------//
 // CodeGen Common Patterns
 //----------------------------------------------------------------------------//
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index fb8427502278..527317cfb253 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -130,6 +130,10 @@ createLLVMGPUCastTypeToFitMMAPass();
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createLLVMGPUDistribute();
 
+// Extract address computations (including the ones with GPU instructions) into
+// their own separate instructions.
+std::unique_ptr<Pass> createExtractAddressComputationGPUPass();
+
 /// Create pass selecting the lowering strategy for LLVMGPU.
 std::unique_ptr<OperationPass<ModuleOp>>
 createLLVMGPUSelectLoweringStrategyPass();

From 354ab2ee13a7c9ad0eb108e66d4bfbe7ec2aef49 Mon Sep 17 00:00:00 2001
From: hanhanW <hanhan0912@gmail.com>
Date: Tue, 13 Aug 2024 16:32:12 -0700
Subject: [PATCH 2/2] [LLVMGPU] Switch to new pass generation tablegen
 definitions.

This is mostly an NFC change. The revision also applies a few small cleanups:

- Switch a couple of passes to follow the `create.*Pass` naming convention.

Signed-off-by: hanhanW <hanhan0912@gmail.com>
---
 .../LLVMGPU/AMDGPUChainedMatmulPass.cpp       |  15 +--
 .../iree/compiler/Codegen/LLVMGPU/BUILD.bazel |   2 -
 .../compiler/Codegen/LLVMGPU/CMakeLists.txt   |   2 -
 .../Codegen/LLVMGPU/ConvertToLLVM.cpp         |  13 +-
 .../Codegen/LLVMGPU/ConvertToNVVM.cpp         |  15 ++-
 .../Codegen/LLVMGPU/ConvertToROCDL.cpp        |  15 ++-
 .../ExtractAddressComputationGPUPass.cpp      |  15 +--
 .../LLVMGPUCastAddressSpaceFunction.cpp       |  15 +--
 .../LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp       |  14 +-
 .../LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp |  15 +--
 .../LLVMGPU/LLVMGPULowerExecutableTarget.cpp  |  20 ++-
 .../LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp  |  10 +-
 .../Codegen/LLVMGPU/LLVMGPUPrefetching.cpp    |  13 +-
 .../LLVMGPU/LLVMGPUPromoteMatmulToFitMMA.cpp  |  11 +-
 .../LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp |  20 ++-
 .../LLVMGPUTensorCoreVectorization.cpp        |  15 ++-
 .../Codegen/LLVMGPU/LLVMGPUTensorPad.cpp      |  14 +-
 .../LLVMGPU/LLVMGPUTileAndDistribute.cpp      |  15 ++-
 .../LLVMGPU/LLVMGPUVectorDistribute.cpp       |  15 +--
 .../Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp |  15 +--
 .../Codegen/LLVMGPU/LLVMGPUVectorToGPU.cpp    |  13 +-
 .../compiler/Codegen/LLVMGPU/PassDetail.h     |  26 ----
 .../iree/compiler/Codegen/LLVMGPU/Passes.cpp  |  25 ++--
 .../iree/compiler/Codegen/LLVMGPU/Passes.h    | 121 +++---------------
 .../iree/compiler/Codegen/LLVMGPU/Passes.td   |  82 ++++++------
 .../LLVMGPU/ROCDLLowerExecutableTarget.cpp    |  16 +--
 .../Codegen/LLVMGPU/ROCDLPassDetail.h         |  21 ---
 .../compiler/Codegen/LLVMGPU/ROCDLPasses.h    |  16 +--
 .../compiler/Codegen/LLVMGPU/ROCDLPasses.td   |   8 +-
 .../LLVMGPU/ROCDLSelectLoweringStrategy.cpp   |  15 +--
 30 files changed, 224 insertions(+), 388 deletions(-)
 delete mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/PassDetail.h
 delete mode 100644 compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp
index d22bd206b053..6a1646e5c4ac 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/AMDGPUChainedMatmulPass.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include <numeric>
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
+
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/VectorOpUtils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
@@ -13,6 +13,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_AMDGPUPREPAREFORCHAINEDMATMULPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 using VectorValue = TypedValue<VectorType>;
 
 namespace {
@@ -59,8 +62,8 @@ namespace {
 ///   C = A @ B --> C.T = B.T @ A.T
 /// is only defined on standard "@" function, it may be a different
 /// transformation for other indexing maps.
-struct AMDGPUPrepareForChainedMatmulPass
-    : public AMDGPUPrepareForChainedMatmulBase<
+struct AMDGPUPrepareForChainedMatmulPass final
+    : impl::AMDGPUPrepareForChainedMatmulPassBase<
           AMDGPUPrepareForChainedMatmulPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<vector::VectorDialect>();
@@ -255,10 +258,4 @@ struct AMDGPUPrepareForChainedMatmulPass
 };
 
 } // namespace
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createAMDGPUPrepareForChainedMatmulPass() {
-  return std::make_unique<AMDGPUPrepareForChainedMatmulPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
index c57e1f585186..9ef45c757d63 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
@@ -32,7 +32,6 @@ iree_gentbl_cc_library(
 iree_compiler_cc_library(
     name = "PassHeaders",
     hdrs = [
-        "PassDetail.h",
         "Passes.h",
         "Passes.h.inc",
     ],
@@ -69,7 +68,6 @@ iree_gentbl_cc_library(
 iree_compiler_cc_library(
     name = "ROCDLPassHeaders",
     hdrs = [
-        "ROCDLPassDetail.h",
         "ROCDLPasses.h",
         "ROCDLPasses.h.inc",
     ],
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
index 30e722e39307..a5d3b0844462 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
@@ -23,7 +23,6 @@ iree_cc_library(
   NAME
     PassHeaders
   HDRS
-    "PassDetail.h"
     "Passes.h"
     "Passes.h.inc"
   DEPS
@@ -52,7 +51,6 @@ iree_cc_library(
   NAME
     ROCDLPassHeaders
   HDRS
-    "ROCDLPassDetail.h"
     "ROCDLPasses.h"
     "ROCDLPasses.h.inc"
   DEPS
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp
index 821623ddf609..a1112454fcb9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp
@@ -6,7 +6,6 @@
 
 #include "iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.h"
 
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
@@ -25,6 +24,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_TESTLLVMGPUSCALARIZEMATHOPPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 void ConvertToDynamicSharedMemory(ModuleOp moduleOp) {
   SymbolTableCollection symbolTableCollection;
   // Collect all the adressOfOps to static shared memory globals.
@@ -183,8 +185,9 @@ struct ConvertSharedMemAllocOp : public OpRewritePattern<memref::AllocOp> {
 
 /// Pass to test in dialect transformation used to legalize the IR before
 /// convertToNVVM/ConvertToROCDL.
-class TestLLVMGPULegalizeOpPass
-    : public TestLLVMGPUScalarizeMathOpBase<TestLLVMGPULegalizeOpPass> {
+class TestLLVMGPULegalizeOpPass final
+    : public impl::TestLLVMGPUScalarizeMathOpPassBase<
+          TestLLVMGPULegalizeOpPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<vector::VectorDialect>();
   }
@@ -542,10 +545,6 @@ void populateLowerHALInterfaceOp(RewritePatternSet &patterns) {
       patterns.getContext());
 }
 
-std::unique_ptr<OperationPass<ModuleOp>> createTestLLVMGPULegalizePass() {
-  return std::make_unique<TestLLVMGPULegalizeOpPass>();
-}
-
 static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
   return IntegerAttr::get(IntegerType::get(ctx, 64), space);
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
index edcb2bb59cfb..b1ac58b1b62f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToNVVM.cpp
@@ -8,7 +8,6 @@
 #include "iree/compiler/Codegen/Common/Transforms.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
@@ -24,6 +23,7 @@
 #include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
 #include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
@@ -35,6 +35,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_CONVERTTONVVMPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
 /// A pass that replaces all occurrences of GPU device operations with their
@@ -42,7 +45,10 @@ namespace {
 ///
 /// This pass only handles device code and is not meant to be run on GPU host
 /// code.
-struct ConvertToNVVMPass : public ConvertToNVVMBase<ConvertToNVVMPass> {
+struct ConvertToNVVMPass final
+    : impl::ConvertToNVVMPassBase<ConvertToNVVMPass> {
+  using impl::ConvertToNVVMPassBase<ConvertToNVVMPass>::ConvertToNVVMPassBase;
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry
         .insert<gpu::GPUDialect, IREE::GPU::IREEGPUDialect, LLVM::LLVMDialect,
@@ -180,9 +186,4 @@ struct ConvertToNVVMPass : public ConvertToNVVMBase<ConvertToNVVMPass> {
 };
 
 } // namespace
-
-std::unique_ptr<OperationPass<ModuleOp>> createConvertToNVVMPass() {
-  return std::make_unique<ConvertToNVVMPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index d66156ac4d24..215a4bb8a237 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -8,7 +8,6 @@
 #include "iree/compiler/Codegen/Common/Transforms.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
@@ -40,6 +39,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_CONVERTTOROCDLPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 static llvm::cl::opt<int>
     clROCMIndexingBits("iree-rocm-index-bits",
                        llvm::cl::desc("Set the bit width of indices in ROCm."),
@@ -75,7 +77,11 @@ static void populateConvertGPUToAMDGPUPatterns(RewritePatternSet &patterns) {
 ///
 /// This pass only handles device code and is not meant to be run on GPU host
 /// code.
-struct ConvertToROCDLPass : public ConvertToROCDLBase<ConvertToROCDLPass> {
+struct ConvertToROCDLPass final
+    : impl::ConvertToROCDLPassBase<ConvertToROCDLPass> {
+  using impl::ConvertToROCDLPassBase<
+      ConvertToROCDLPass>::ConvertToROCDLPassBase;
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry
         .insert<IREE::GPU::IREEGPUDialect, LLVM::LLVMDialect,
@@ -203,9 +209,4 @@ struct ConvertToROCDLPass : public ConvertToROCDLBase<ConvertToROCDLPass> {
     LDBG("After converting to dynamic shared memory\n" << m);
   }
 };
-
-std::unique_ptr<OperationPass<ModuleOp>> createConvertToROCDLPass() {
-  return std::make_unique<ConvertToROCDLPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ExtractAddressComputationGPUPass.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ExtractAddressComputationGPUPass.cpp
index 6966f4127522..359aaff24d00 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ExtractAddressComputationGPUPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ExtractAddressComputationGPUPass.cpp
@@ -5,20 +5,22 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/ExtractAddressComputation.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "llvm/Support/Debug.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #define DEBUG_TYPE "extract-address-computation-gpu"
 
-using namespace mlir;
-
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_EXTRACTADDRESSCOMPUTATIONGPUPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 //===----------------------------------------------------------------------===//
 // Helper functions for the `load base[off0...]`
 //  => `load (subview base[off0...])[0...]` pattern.
@@ -80,8 +82,8 @@ populateExtractAddressComputationGPUPatterns(RewritePatternSet &patterns) {
 // Pass registration
 //===----------------------------------------------------------------------===//
 namespace {
-struct ExtractAddressComputationGPUPass
-    : public ExtractAddressComputationGPUBase<
+struct ExtractAddressComputationGPUPass final
+    : impl::ExtractAddressComputationGPUPassBase<
           ExtractAddressComputationGPUPass> {
   void runOnOperation() override;
 };
@@ -96,7 +98,4 @@ void ExtractAddressComputationGPUPass::runOnOperation() {
   }
 }
 
-std::unique_ptr<Pass> createExtractAddressComputationGPUPass() {
-  return std::make_unique<ExtractAddressComputationGPUPass>();
-}
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp
index 624361fadec8..aad0618e54b4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastAddressSpaceFunction.cpp
@@ -4,7 +4,6 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
@@ -12,15 +11,19 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Pass/Pass.h"
 
 #define DEBUG_TYPE "iree-llvmgpu-cast-address-space-function"
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUCASTADDRESSSPACEFUNCTIONPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
-struct LLVMGPUCastAddressSpaceFunctionPass
-    : public LLVMGPUCastAddressSpaceFunctionBase<
+struct LLVMGPUCastAddressSpaceFunctionPass final
+    : impl::LLVMGPUCastAddressSpaceFunctionPassBase<
           LLVMGPUCastAddressSpaceFunctionPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<affine::AffineDialect, gpu::GPUDialect>();
@@ -75,10 +78,4 @@ struct LLVMGPUCastAddressSpaceFunctionPass
 };
 
 } // namespace
-
-std::unique_ptr<OperationPass<ModuleOp>>
-createLLVMGPUCastAddressSpaceFunction() {
-  return std::make_unique<LLVMGPUCastAddressSpaceFunctionPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp
index 621430b7e064..013745ef072e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUCastTypeToFitMMA.cpp
@@ -6,7 +6,6 @@
 
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/VectorOpUtils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -21,6 +20,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUCASTTYPETOFITMMAPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
 struct UpcastContractOutput final : OpRewritePattern<vector::ContractionOp> {
@@ -72,9 +74,8 @@ struct UpcastContractOutput final : OpRewritePattern<vector::ContractionOp> {
   }
 };
 
-struct LLVMGPUCastTypeToFitMMAPass
-    : public LLVMGPUCastTypeToFitMMABase<LLVMGPUCastTypeToFitMMAPass> {
-public:
+struct LLVMGPUCastTypeToFitMMAPass final
+    : impl::LLVMGPUCastTypeToFitMMAPassBase<LLVMGPUCastTypeToFitMMAPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<vector::VectorDialect>();
     registry.insert<arith::ArithDialect>();
@@ -114,9 +115,4 @@ struct LLVMGPUCastTypeToFitMMAPass
   }
 };
 } // namespace
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUCastTypeToFitMMAPass() {
-  return std::make_unique<LLVMGPUCastTypeToFitMMAPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
index dee5e89b086b..b37d25af33e6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureVectorLayouts.cpp
@@ -9,7 +9,6 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "llvm/ADT/SetVector.h"
@@ -27,6 +26,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUCONFIGUREVECTORLAYOUTSPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
 // Sets an anchoring layout for the given contraction op. Looks for a
@@ -279,10 +281,9 @@ LogicalResult setTransferReadAnchor(ArrayRef<int64_t> workgroupSize,
   return success();
 }
 
-struct LLVMGPUConfigureVectorLayoutsPass
-    : public LLVMGPUConfigureVectorLayoutsBase<
+struct LLVMGPUConfigureVectorLayoutsPass final
+    : impl::LLVMGPUConfigureVectorLayoutsPassBase<
           LLVMGPUConfigureVectorLayoutsPass> {
-public:
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<IREE::VectorExt::IREEVectorExtDialect>();
     registry.insert<vector::VectorDialect>();
@@ -360,10 +361,4 @@ struct LLVMGPUConfigureVectorLayoutsPass
   }
 };
 } // namespace
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUConfigureVectorLayouts() {
-  return std::make_unique<LLVMGPUConfigureVectorLayoutsPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 2b72c8ab40b5..dc96c92b0c50 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -10,7 +10,6 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
@@ -39,6 +38,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPULOWEREXECUTABLETARGETPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 /// Lowers an hal.executable.variant operation to scalar/native-vector
 /// code. Invokes different compilation pipeline to
@@ -46,10 +48,13 @@ namespace {
 /// - then convert to NVVM/ROCDL dialect.
 /// This should be merged with the equivalent pass in LinalgToLLVM. Fo
 /// simplicity it is currently a separate pass.
-class LLVMGPULowerExecutableTargetPass
-    : public LLVMGPULowerExecutableTargetBase<
+class LLVMGPULowerExecutableTargetPass final
+    : public impl::LLVMGPULowerExecutableTargetPassBase<
           LLVMGPULowerExecutableTargetPass> {
 public:
+  using impl::LLVMGPULowerExecutableTargetPassBase<
+      LLVMGPULowerExecutableTargetPass>::LLVMGPULowerExecutableTargetPassBase;
+
   void getDependentDialects(DialectRegistry &registry) const override {
     // clang-format off
     registry
@@ -69,10 +74,6 @@ class LLVMGPULowerExecutableTargetPass
     // clang-format on
   }
 
-  LLVMGPULowerExecutableTargetPass() = default;
-  LLVMGPULowerExecutableTargetPass(
-      const LLVMGPULowerExecutableTargetPass &pass) {}
-
   void runOnOperation() override;
 };
 
@@ -217,9 +218,4 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
   }
 }
 
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPULowerExecutableTargetPass() {
-  return std::make_unique<LLVMGPULowerExecutableTargetPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp
index 8aaf2731a41a..a3db9be2049f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPackSharedMemoryAlloc.cpp
@@ -4,19 +4,19 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include <algorithm>
-
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUPACKSHAREDMEMORYALLOCPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
-struct LLVMGPUPackSharedMemoryAllocPass
-    : public LLVMGPUPackSharedMemoryAllocBase<
+struct LLVMGPUPackSharedMemoryAllocPass final
+    : impl::LLVMGPUPackSharedMemoryAllocPassBase<
           LLVMGPUPackSharedMemoryAllocPass> {
 public:
   void getDependentDialects(DialectRegistry &registry) const override {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPrefetching.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPrefetching.cpp
index 3f904be7752a..e131252c9307 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPrefetching.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPrefetching.cpp
@@ -4,7 +4,6 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
 #include "llvm/ADT/SmallVector.h"
@@ -15,10 +14,14 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUPREFETCHSHAREDMEMORYPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
 struct LLVMGPUPrefetchSharedMemoryPass final
-    : LLVMGPUPrefetchSharedMemoryBase<LLVMGPUPrefetchSharedMemoryPass> {
+    : impl::LLVMGPUPrefetchSharedMemoryPassBase<
+          LLVMGPUPrefetchSharedMemoryPass> {
   void runOnOperation() override {
     FunctionOpInterface funcOp = getOperation();
     IRRewriter rewriter(funcOp.getContext());
@@ -37,10 +40,4 @@ struct LLVMGPUPrefetchSharedMemoryPass final
 };
 
 } // namespace
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUPrefetchSharedMemoryPass() {
-  return std::make_unique<LLVMGPUPrefetchSharedMemoryPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPromoteMatmulToFitMMA.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPromoteMatmulToFitMMA.cpp
index 5e61991070df..26b96d7ad91b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPromoteMatmulToFitMMA.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUPromoteMatmulToFitMMA.cpp
@@ -4,7 +4,6 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Dialect/Flow/IR/FlowOps.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -16,14 +15,18 @@
 #define DEBUG_TYPE "iree-llvmgpu-promote-matmul-to-fit-mma"
 
 namespace mlir::iree_compiler {
-#define GEN_PASS_DECL_LLVMGPUPROMOTEMATMULTOFITMMA
+
+#define GEN_PASS_DEF_LLVMGPUPROMOTEMATMULTOFITMMAPASS
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
-class LLVMGPUPromoteMatmulToFitMMAPass
-    : public LLVMGPUPromoteMatmulToFitMMABase<
+class LLVMGPUPromoteMatmulToFitMMAPass final
+    : public impl::LLVMGPUPromoteMatmulToFitMMAPassBase<
           LLVMGPUPromoteMatmulToFitMMAPass> {
 public:
+  using impl::LLVMGPUPromoteMatmulToFitMMAPassBase<
+      LLVMGPUPromoteMatmulToFitMMAPass>::LLVMGPUPromoteMatmulToFitMMAPassBase;
   explicit LLVMGPUPromoteMatmulToFitMMAPass(
       const LLVMGPUMatmulPadOption &option) {
     this->targetDimensions.setValue(option);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp
index b41a5deef11a..a6d630717bb6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUSelectLoweringStrategy.cpp
@@ -7,7 +7,6 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
@@ -28,13 +27,19 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUSELECTLOWERINGSTRATEGYPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 /// Selects a lowering strategy for taking a hal.executable.variant operation
 /// to scalar/native-vector code.
-class LLVMGPUSelectLoweringStrategyPass
-    : public LLVMGPUSelectLoweringStrategyBase<
+class LLVMGPUSelectLoweringStrategyPass final
+    : public impl::LLVMGPUSelectLoweringStrategyPassBase<
           LLVMGPUSelectLoweringStrategyPass> {
 public:
+  using impl::LLVMGPUSelectLoweringStrategyPassBase<
+      LLVMGPUSelectLoweringStrategyPass>::LLVMGPUSelectLoweringStrategyPassBase;
+
   void getDependentDialects(DialectRegistry &registry) const override {
     // TODO(qedawkins): Once TransformStrategies is deprecated, drop the
     // unnecessary dialect registrations.
@@ -56,10 +61,6 @@ class LLVMGPUSelectLoweringStrategyPass
     // clang-format on
   }
 
-  LLVMGPUSelectLoweringStrategyPass() = default;
-  LLVMGPUSelectLoweringStrategyPass(
-      const LLVMGPUSelectLoweringStrategyPass &pass) {}
-
   void runOnOperation() override;
 };
 } // namespace
@@ -116,9 +117,4 @@ void LLVMGPUSelectLoweringStrategyPass::runOnOperation() {
   }
 }
 
-std::unique_ptr<OperationPass<ModuleOp>>
-createLLVMGPUSelectLoweringStrategyPass() {
-  return std::make_unique<LLVMGPUSelectLoweringStrategyPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
index 3ed9b705bbd7..aa72280e5fcc 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorCoreVectorization.cpp
@@ -6,7 +6,6 @@
 
 #include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/MarkerUtils.h"
@@ -26,6 +25,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUTENSORCOREVECTORIZATIONPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 //====---------------------------------------------------------------------===//
 // Patterns for vectorization
 //====---------------------------------------------------------------------===//
@@ -66,11 +68,16 @@ static void populateVectorUnrollPatterns(RewritePatternSet &patterns,
 }
 
 namespace {
-struct LLVMGPUTensorCoreVectorizationPass
-    : public LLVMGPUTensorCoreVectorizationBase<
+class LLVMGPUTensorCoreVectorizationPass final
+    : public impl::LLVMGPUTensorCoreVectorizationPassBase<
           LLVMGPUTensorCoreVectorizationPass> {
-  LLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType)
+public:
+  using impl::LLVMGPUTensorCoreVectorizationPassBase<
+      LLVMGPUTensorCoreVectorizationPass>::
+      LLVMGPUTensorCoreVectorizationPassBase;
+  explicit LLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType)
       : tensorCoreType(tensorCoreType) {}
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<vector::VectorDialect>();
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorPad.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorPad.cpp
index cc49c958b14c..1a4ffe4964d4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorPad.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTensorPad.cpp
@@ -4,7 +4,6 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/LinalgOpInfo.h"
@@ -21,6 +20,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUTENSORPADPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
 static FailureOr<SmallVector<int64_t>>
@@ -108,8 +110,8 @@ static bool hasTwoOrThreeLoopsInfo(linalg::LinalgOp linalgOp) {
          linalgOp.getNumParallelLoops() <= 3;
 }
 
-struct LLVMGPUTensorPadPass
-    : public LLVMGPUTensorPadBase<LLVMGPUTensorPadPass> {
+struct LLVMGPUTensorPadPass final
+    : impl::LLVMGPUTensorPadPassBase<LLVMGPUTensorPadPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<bufferization::BufferizationDialect>();
   }
@@ -166,10 +168,4 @@ struct LLVMGPUTensorPadPass
   }
 };
 } // namespace
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUTensorPadPass() {
-  return std::make_unique<LLVMGPUTensorPadPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
index bcbe15a4145c..ac3cec65fa55 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUTileAndDistribute.cpp
@@ -7,7 +7,6 @@
 #include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/Transforms/Transforms.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
@@ -15,6 +14,7 @@
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
@@ -25,6 +25,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUTILEANDDISTRIBUTEPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 /// Tiles to workgroup level. Workgroup tiling is done at the flow level but we
 /// may have extra tiling for the reduction dimension. Therefore we tile again
 /// without distributing.
@@ -192,15 +195,19 @@ static LogicalResult tileToInvocation(mlir::FunctionOpInterface funcOp,
 }
 
 namespace {
-struct LLVMGPUTileAndDistributePass
-    : public LLVMGPUTileAndDistributeBase<LLVMGPUTileAndDistributePass> {
+class LLVMGPUTileAndDistributePass final
+    : public impl::LLVMGPUTileAndDistributePassBase<
+          LLVMGPUTileAndDistributePass> {
 private:
   // Distribute the workloads to warp if true otherwise distribute to threads.
   bool distributeToWarp = false;
 
 public:
+  using impl::LLVMGPUTileAndDistributePassBase<
+      LLVMGPUTileAndDistributePass>::LLVMGPUTileAndDistributePassBase;
   LLVMGPUTileAndDistributePass(bool distributeToWarp)
       : distributeToWarp(distributeToWarp) {}
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<affine::AffineDialect, gpu::GPUDialect>();
   }
@@ -304,7 +311,7 @@ struct LLVMGPUTileAndDistributePass
 } // namespace
 
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUTileAndDistribute(bool distributeToWarp) {
+createLLVMGPUTileAndDistributePass(bool distributeToWarp) {
   return std::make_unique<LLVMGPUTileAndDistributePass>(distributeToWarp);
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
index e4a9a81708ca..466d7bd1bf80 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp
@@ -8,7 +8,6 @@
 #include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
@@ -24,6 +23,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUVECTORDISTRIBUTEPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 namespace {
 
 class ContractionVectorLayoutOptions : public VectorLayoutOptions {
@@ -44,9 +46,8 @@ class ContractionVectorLayoutOptions : public VectorLayoutOptions {
   RewritePatternSet patterns;
 };
 
-struct LLVMGPUVectorDistributePass
-    : public LLVMGPUVectorDistributeBase<LLVMGPUVectorDistributePass> {
-public:
+struct LLVMGPUVectorDistributePass final
+    : impl::LLVMGPUVectorDistributePassBase<LLVMGPUVectorDistributePass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<IREE::VectorExt::IREEVectorExtDialect>();
     registry.insert<affine::AffineDialect>();
@@ -116,10 +117,4 @@ struct LLVMGPUVectorDistributePass
   }
 };
 } // namespace
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUVectorDistribute() {
-  return std::make_unique<LLVMGPUVectorDistributePass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
index 689393c2b42f..bcc2d00c69bd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
@@ -4,9 +4,9 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
@@ -16,13 +16,16 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUVECTORLOWERINGPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 //====---------------------------------------------------------------------===//
 // Patterns for late vector op lowering.
 //====---------------------------------------------------------------------===//
 
 namespace {
-struct LLVMGPUVectorLoweringPass
-    : public LLVMGPUVectorLoweringBase<LLVMGPUVectorLoweringPass> {
+struct LLVMGPUVectorLoweringPass final
+    : impl::LLVMGPUVectorLoweringPassBase<LLVMGPUVectorLoweringPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<affine::AffineDialect>();
     registry.insert<memref::MemRefDialect>();
@@ -70,10 +73,4 @@ struct LLVMGPUVectorLoweringPass
   }
 };
 } // namespace
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUVectorLoweringPass() {
-  return std::make_unique<LLVMGPUVectorLoweringPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorToGPU.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorToGPU.cpp
index 4941ee3e8a24..ca631379867c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorToGPU.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorToGPU.cpp
@@ -5,7 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/Common/GPU/GPUPatterns.h"
-#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
 #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
@@ -21,6 +20,9 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_LLVMGPUVECTORTOGPUPASS
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
+
 static void swizzleSharedMemory(mlir::FunctionOpInterface funcOp) {
   SmallVector<memref::AllocOp> shmAllocOps;
   funcOp->walk([&](memref::AllocOp allocOp) {
@@ -38,10 +40,13 @@ static void swizzleSharedMemory(mlir::FunctionOpInterface funcOp) {
 }
 
 namespace {
-struct LLVMGPUVectorToGPUPass
-    : public LLVMGPUVectorToGPUBase<LLVMGPUVectorToGPUPass> {
+struct LLVMGPUVectorToGPUPass final
+    : impl::LLVMGPUVectorToGPUPassBase<LLVMGPUVectorToGPUPass> {
+  using impl::LLVMGPUVectorToGPUPassBase<
+      LLVMGPUVectorToGPUPass>::LLVMGPUVectorToGPUPassBase;
   LLVMGPUVectorToGPUPass(GPUTensorCoreType tensorCoreType)
       : tensorCoreType(tensorCoreType) {}
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<gpu::GPUDialect, nvgpu::NVGPUDialect, affine::AffineDialect,
                     memref::MemRefDialect>();
@@ -103,7 +108,7 @@ struct LLVMGPUVectorToGPUPass
 } // namespace
 
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUVectorToGPU(GPUTensorCoreType tensorCoreType) {
+createLLVMGPUVectorToGPUPass(GPUTensorCoreType tensorCoreType) {
   return std::make_unique<LLVMGPUVectorToGPUPass>(tensorCoreType);
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/PassDetail.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/PassDetail.h
deleted file mode 100644
index e042deb407f0..000000000000
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/PassDetail.h
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2023 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_COMPILER_CODEGEN_LLVMGPU_PASS_DETAIL_H_
-#define IREE_COMPILER_CODEGEN_LLVMGPU_PASS_DETAIL_H_
-
-#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
-#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
-#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
-#include "mlir/Interfaces/FunctionInterfaces.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir::iree_compiler {
-
-#define GEN_PASS_CLASSES
-#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"
-
-} // namespace mlir::iree_compiler
-
-#endif // IREE_COMPILER_CODEGEN_LLVMGPU_PASS_DETAIL_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 8b74d1b4a3d4..294e3b81dd57 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -497,7 +497,7 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
 
   // Distribute linalg onto warps within the workgroup.
   funcPassManager.addPass(
-      createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true));
+      createLLVMGPUTileAndDistributePass(/*distributeToWarp=*/true));
   funcPassManager.addPass(createRemoveSingleIterationLoopPass());
   if (pipelineDepth > 1) {
     funcPassManager.addPass(createGPUMultiBufferingPass(
@@ -518,7 +518,8 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(createCSEPass());
 
   // Linalg -> vector
-  funcPassManager.addPass(createLLVMGPUTensorCoreVectorizationPass());
+  funcPassManager.addPass(
+      createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType::WMMA));
   funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
   funcPassManager.addPass(createCSEPass());
   funcPassManager.addPass(createOptimizeVectorTransferPass());
@@ -537,7 +538,8 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(createLLVMGPUVectorToGPU());
+  funcPassManager.addPass(
+      createLLVMGPUVectorToGPUPass(GPUTensorCoreType::WMMA));
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
 
@@ -551,7 +553,7 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
       llvm::to_underlying(PipeliningSchedulingStrategy::loadGlobalStage0);
   funcPassManager.addPass(createGPUPipeliningPass(pipelieningOptions));
   // Optimize shared memory usage.
-  funcPassManager.addPass(createLLVMGPUPackSharedMemoryAlloc());
+  funcPassManager.addPass(createLLVMGPUPackSharedMemoryAllocPass());
 }
 
 //===---------------------------------------------------------------------===//
@@ -565,7 +567,7 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(
 
   // Distribute linalg onto warps within the workgroup.
   funcPassManager.addPass(
-      createLLVMGPUTileAndDistribute(/*distributeToWarp=*/true));
+      createLLVMGPUTileAndDistributePass(/*distributeToWarp=*/true));
   funcPassManager.addPass(createRemoveSingleIterationLoopPass());
   if (pipelineDepth > 1) {
     funcPassManager.addPass(createGPUMultiBufferingPass(
@@ -604,7 +606,7 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
   funcPassManager.addPass(
-      createLLVMGPUVectorToGPU(GPUTensorCoreType::MMA_SYNC));
+      createLLVMGPUVectorToGPUPass(GPUTensorCoreType::MMA_SYNC));
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
 
@@ -618,7 +620,7 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(
       llvm::to_underlying(PipeliningSchedulingStrategy::nvidiaTensorCore);
   funcPassManager.addPass(createGPUPipeliningPass(pipelieningOptions));
   // Optimize shared memory usage.
-  funcPassManager.addPass(createLLVMGPUPackSharedMemoryAlloc());
+  funcPassManager.addPass(createLLVMGPUPackSharedMemoryAllocPass());
 }
 
 //===---------------------------------------------------------------------===//
@@ -801,8 +803,8 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(createAMDGPUPrepareForChainedMatmulPass());
 
   // Vector SIMD -> Vector SIMT
-  funcPassManager.addPass(createLLVMGPUConfigureVectorLayouts());
-  funcPassManager.addPass(createLLVMGPUVectorDistribute());
+  funcPassManager.addPass(createLLVMGPUConfigureVectorLayoutsPass());
+  funcPassManager.addPass(createLLVMGPUVectorDistributePass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
 
@@ -889,7 +891,8 @@ void addGPUSimpleDistributePassPipeline(OpPassManager &funcPassManager) {
   tileAndBufferize(funcPassManager);
 
   // Distribute linalg onto threads within the workgroup.
-  funcPassManager.addPass(createLLVMGPUTileAndDistribute());
+  funcPassManager.addPass(
+      createLLVMGPUTileAndDistributePass(/*distributeToWarp=*/false));
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
 
@@ -1019,7 +1022,7 @@ static void addLowerToLLVMGPUPasses(OpPassManager &modulePassManager,
   // Strip out the debug info for the kernel.
   modulePassManager.addPass(createStripDebugInfoPass());
   // Cast address spaces of all function arguments to generic.
-  modulePassManager.addPass(createLLVMGPUCastAddressSpaceFunction());
+  modulePassManager.addPass(createLLVMGPUCastAddressSpaceFunctionPass());
 
   if (forROCDL) {
     // convert to ROCDL.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 527317cfb253..2804a32f5533 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -42,9 +42,9 @@ struct LLVMGPUPipelineOptions {
 llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
                               const LLVMGPUPipelineOptions &options);
 
-//===----------------------------------------------------------------------===//
-// Passes
-//===----------------------------------------------------------------------===//
+//----------------------------------------------------------------------------//
+// LLVMGPU backend Pass Pipelines.
+//----------------------------------------------------------------------------//
 
 /// Lowering using SIMT CUDA core operations.
 void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
@@ -112,126 +112,43 @@ void buildLLVMGPUCodegenConfigurationPassPipeline(
 void buildLLVMGPUCodegenPassPipeline(OpPassManager &variantPassManagery,
                                      bool useROCM);
 
-/// Performs the final conversion to NNVM+LLVM dialect.
-std::unique_ptr<OperationPass<ModuleOp>> createConvertToNVVMPass();
-
-/// Performs the final conversion to ROCDL+LLVM dialect.
-std::unique_ptr<OperationPass<ModuleOp>> createConvertToROCDLPass();
-
-/// Cast address space to generic in CallOp and FuncOp
-std::unique_ptr<OperationPass<ModuleOp>>
-createLLVMGPUCastAddressSpaceFunction();
-
-/// Perform type extension/truncation over vector.contract types to target GPU
-/// MMA intrinsics.
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUCastTypeToFitMMAPass();
-
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUDistribute();
-
-// Extract address computations (including the ones with GPU instructions) into
-// their own separate instructions.
-std::unique_ptr<Pass> createExtractAddressComputationGPUPass();
-
-/// Create pass selecting the lowering strategy for LLVMGPU.
-std::unique_ptr<OperationPass<ModuleOp>>
-createLLVMGPUSelectLoweringStrategyPass();
-
-/// Create pass calling the dynamic pipeline for LLVMGPU.
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPULowerExecutableTargetPass();
-
-// Pass to pack shared memory allocations in order to reduce shared memory
-// usage.
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUPackSharedMemoryAlloc();
+/// Lowering calling vectorization patterns.
+LogicalResult
+verifyGPUMatmulPipeline(Operation *op,
+                        IREE::Codegen::LoweringConfigAttr loweringConfig,
+                        IREE::Codegen::TranslationInfoAttr translationInfo,
+                        ArrayRef<int64_t> workgroupSize);
 
-std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUPrefetchSharedMemoryPass();
+//------------------------------------------------------------------------------
+// Wrappers that do not use tablegen options.
+//------------------------------------------------------------------------------
 
-/// Pass to pad operations on tensors in top-down order.
 enum class LLVMGPUMatmulPadOption { ParallelDims, ReductionDims };
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
-createLLVMGPUPromoteMatmulToFitMMAPass(
-    LLVMGPUMatmulPadOption option = LLVMGPUMatmulPadOption::ParallelDims);
-
-// Pass to set layouts for vector distribution.
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUConfigureVectorLayouts();
+createLLVMGPUPromoteMatmulToFitMMAPass(LLVMGPUMatmulPadOption option);
 
 enum class GPUTensorCoreType {
   WMMA = 0,
   MMA_SYNC = 1,
 };
 
-/// Convert Linalg ops to Vector and prepare converstion to GPU MMA ops.
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUTensorCoreVectorizationPass(
-    GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);
-
-//. Pass to pad out tensors up to static dimensions.
 std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUTensorPadPass();
+createLLVMGPUTensorCoreVectorizationPass(GPUTensorCoreType tensorCoreType);
 
-/// Perform tiling and distribution to threads.
 std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUTileAndDistribute(bool distributeToWarp = false);
+createLLVMGPUVectorToGPUPass(GPUTensorCoreType tensorCoreType);
 
-// Pass to distribute vectorized functions.
 std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUVectorDistribute();
-
-/// Lower vector ops before convertion to LLVM.
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createLLVMGPUVectorLoweringPass();
-
-/// Converts vector ops to gpu dialect.
-std::unique_ptr<InterfacePass<FunctionOpInterface>> createLLVMGPUVectorToGPU(
-    GPUTensorCoreType tensorCoreType = GPUTensorCoreType::WMMA);
-
-/// Lowering calling vectorization patterns.
-LogicalResult
-verifyGPUMatmulPipeline(Operation *op,
-                        IREE::Codegen::LoweringConfigAttr loweringConfig,
-                        IREE::Codegen::TranslationInfoAttr translationInfo,
-                        ArrayRef<int64_t> workgroupSize);
-
-/// Given a chain of matmuls with some or no operations
-/// in between, like
-///
-/// d = matmul_transpose_b(a, b) + c
-/// ...
-/// e = matmul_transpose_b(d, f) + g
-///
-/// this pattern transforms the above IR to
-///
-/// c.t = transpose c
-/// d = matmul_transpose_b(b, a) + c.t
-/// d.t = transpose d
-/// ...
-/// g.t = transpose g
-/// e = matmul_transpose_b(f, d.t) + g.t
-/// e.t = transpose e
-///
-/// On CDNA architectures, where the layouts of the RHS and result
-/// are the same and transposed from the LHS layout, this type
-/// of transformation can avoid trips to shared memory/shuffle instructions
-/// on operators like Flash Attention.
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createAMDGPUPrepareForChainedMatmulPass();
+createLLVMGPUTileAndDistributePass(bool distributeToWarp);
 
 //----------------------------------------------------------------------------//
 // Register LLVMGPU Passes
 //----------------------------------------------------------------------------//
 
-void registerCodegenLLVMGPUPasses();
+#define GEN_PASS_DECL
+#include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc" // IWYU pragma: keep
 
-//------------------------------------------------------------------------------
-// Test passes
-//------------------------------------------------------------------------------
-
-std::unique_ptr<OperationPass<ModuleOp>> createTestLLVMGPULegalizePass();
+void registerCodegenLLVMGPUPasses();
 
 } // namespace mlir::iree_compiler
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
index 8ea7da497989..33df5d26beba 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
@@ -13,81 +13,93 @@ include "mlir/Pass/PassBase.td"
 // LLVMGPU Passes (keep alphabetical)
 //------------------------------------------------------------------------------
 
-def AMDGPUPrepareForChainedMatmul :
+def AMDGPUPrepareForChainedMatmulPass :
     InterfacePass<"iree-amdgpu-prepare-chained-matmul", "mlir::FunctionOpInterface"> {
   let summary = "Pass to swap operands and transpose accumulator and result";
-  let constructor = "mlir::iree_compiler::createAMDGPUPrepareForChainedMatmulPass()";
+  let description = [{
+    Given a chain of matmuls with some or no operations
+    in between, like
+
+    d = matmul_transpose_b(a, b) + c
+    ...
+    e = matmul_transpose_b(d, f) + g
+
+    this pattern transforms the above IR to
+
+    c.t = transpose c
+    d = matmul_transpose_b(b, a) + c.t
+    d.t = transpose d
+    ...
+    g.t = transpose g
+    e = matmul_transpose_b(f, d.t) + g.t
+    e.t = transpose e
+
+    On CDNA architectures, where the layouts of the RHS and result
+    are the same and transposed from the LHS layout, this type
+    of transformation can avoid trips to shared memory/shuffle instructions
+    on operators like Flash Attention.
+  }];
 }
 
 // TODO: Bring the argument in line with the names used elsewhere.
-def ConvertToNVVM :
+def ConvertToNVVMPass :
     Pass<"iree-convert-to-nvvm", "ModuleOp"> {
   let summary = "Perform final conversion from builtin/GPU/HAL/standard dialect to LLVM "
     "and NVVM dialects";
-  let constructor = "mlir::iree_compiler::createConvertToNVVMPass()";
 }
 
 // TODO: Bring the argument in line with the names used elsewhere.
-def ConvertToROCDL :
+def ConvertToROCDLPass :
     Pass<"iree-convert-to-rocdl", "ModuleOp"> {
   let summary = "Perform final conversion from builtin/GPU/HAL/standard dialect to LLVM "
     "and ROCDL dialects";
-  let constructor = "mlir::iree_compiler::createConvertToROCDLPass()";
 }
 
-def ExtractAddressComputationGPU: Pass<"extract-address-computation-gpu"> {
+def ExtractAddressComputationGPUPass: Pass<"extract-address-computation-gpu"> {
   let summary = "Extract address computations from memory accesses";
   let description = [{
      This pass is similar to `extract-address-computation` except it also
      supports memory accesses that are specific to GPUs.
   }];
-  let constructor = "mlir::iree_compiler::createExtractAddressComputationGPUPass()";
   let dependentDialects = [
       "memref::MemRefDialect", "nvgpu::NVGPUDialect", "affine::AffineDialect"
   ];
 }
 
-def LLVMGPUCastAddressSpaceFunction :
+def LLVMGPUCastAddressSpaceFunctionPass :
     Pass<"iree-llvmgpu-cast-address-space-function", "ModuleOp"> {
   let summary = "Cast address space to generic in CallOp and FuncOp";
-  let constructor = "mlir::iree_compiler::createLLVMGPUCastAddressSpaceFunction()";
 }
 
-def LLVMGPUCastTypeToFitMMA : InterfacePass<"iree-llvmgpu-cast-type-to-fit-mma",
-                                            "mlir::FunctionOpInterface"> {
+def LLVMGPUCastTypeToFitMMAPass : InterfacePass<"iree-llvmgpu-cast-type-to-fit-mma",
+                                                "mlir::FunctionOpInterface"> {
   let summary = "Perform type extension/truncation over vector.contract types "
                 "to target GPU MMA intrinsics";
-  let constructor = "mlir::iree_compiler::createLLVMGPUCastTypeToFitMMAPass()";
 }
 
-def LLVMGPUConfigureVectorLayouts :
+def LLVMGPUConfigureVectorLayoutsPass :
     InterfacePass<"iree-llvmgpu-configure-vector-layouts", "mlir::FunctionOpInterface"> {
   let summary = "Pass to set layouts for vector distribution";
-  let constructor = "mlir::iree_compiler::createLLVMGPUConfigureVectorLayouts()";
 }
 
-def LLVMGPULowerExecutableTarget :
+def LLVMGPULowerExecutableTargetPass :
     InterfacePass<"iree-llvmgpu-lower-executable-target", "mlir::FunctionOpInterface"> {
   let summary = "Perform lowering of executable target using one of the IREE::HAL::DispatchLoweringPassPipeline";
-  let constructor = "mlir::iree_compiler::createLLVMGPULowerExecutableTargetPass()";
 }
 
-def LLVMGPUPackSharedMemoryAlloc :
+def LLVMGPUPackSharedMemoryAllocPass :
     InterfacePass<"iree-llvmgpu-pack-shared-memory-alloc", "mlir::FunctionOpInterface"> {
   let summary = "Pass pack shared memory allocation in order to reduce memory usage.";
-  let constructor = "mlir::iree_compiler::createLLVMGPUPackSharedMemoryAlloc()";
 }
 
-def LLVMGPUPrefetchSharedMemory :
+def LLVMGPUPrefetchSharedMemoryPass :
     InterfacePass<"iree-llvmgpu-prefetch-shared-memory", "mlir::FunctionOpInterface"> {
   let summary = "Rotate scf.for loops to prefetch shared memory with distance 1";
-  let constructor = "mlir::iree_compiler::createLLVMGPUPrefetchSharedMemoryPass()";
 }
 
-def LLVMGPUPromoteMatmulToFitMMA :
+def LLVMGPUPromoteMatmulToFitMMAPass :
     InterfacePass<"iree-llvmgpu-promote-matmul-to-fit-mma", "mlir::FunctionOpInterface"> {
   let summary = "Pass to promote contraction ops to fit mma shapes";
-  let constructor = "mlir::iree_compiler::createLLVMGPUPromoteMatmulToFitMMAPass()";
   let options = [
     Option<"targetDimensions", "target-dimensions", "mlir::iree_compiler::LLVMGPUMatmulPadOption",
            /*default=*/"mlir::iree_compiler::LLVMGPUMatmulPadOption::ParallelDims",
@@ -103,56 +115,48 @@ def LLVMGPUPromoteMatmulToFitMMA :
   ];
 }
 
-def LLVMGPUSelectLoweringStrategy :
+def LLVMGPUSelectLoweringStrategyPass :
     Pass<"iree-llvmgpu-select-lowering-strategy", "ModuleOp"> {
   let summary = "Select a IREE::HAL::DispatchLoweringPassPipeline for lowering the target variant";
-  let constructor = "mlir::iree_compiler::createLLVMGPUSelectLoweringStrategyPass()";
 }
 
-def LLVMGPUTensorCoreVectorization :
+def LLVMGPUTensorCoreVectorizationPass :
     InterfacePass<"iree-llvmgpu-tensorcore-vectorization", "mlir::FunctionOpInterface"> {
   let summary = "Pass to convert linalg into Vector and transform it to a form that can be lowered to GPU MMA ops";
-  let constructor = "mlir::iree_compiler::createLLVMGPUTensorCoreVectorizationPass()";
 }
 
-def LLVMGPUTensorPad :
+def LLVMGPUTensorPadPass :
     InterfacePass<"iree-llvmgpu-tensor-pad", "mlir::FunctionOpInterface"> {
   let summary = "Pass to pad out tensors up to static dimensions.";
-  let constructor = "mlir::iree_compiler::createLLVMGPUTensorPadPass()";
 }
 
-def LLVMGPUTileAndDistribute :
+def LLVMGPUTileAndDistributePass :
     InterfacePass<"iree-llvmgpu-tile-and-distribute", "mlir::FunctionOpInterface"> {
   let summary = "Pass to tile and distribute linalg ops within a workgroup.";
-  let constructor = "mlir::iree_compiler::createLLVMGPUTileAndDistribute()";
 }
 
-def LLVMGPUVectorDistribute :
+def LLVMGPUVectorDistributePass :
     InterfacePass<"iree-llvmgpu-vector-distribute", "mlir::FunctionOpInterface"> {
   let summary = "Pass to distribute vectorized functions.";
-  let constructor = "mlir::iree_compiler::createLLVMGPUVectorDistribute()";
 }
 
-def LLVMGPUVectorLowering :
+def LLVMGPUVectorLoweringPass :
     InterfacePass<"iree-llvmgpu-vector-lowering", "mlir::FunctionOpInterface"> {
   let summary = "Pass to lower Vector ops before conversion to LLVM.";
-  let constructor = "mlir::iree_compiler::createLLVMGPUVectorLoweringPass()";
 }
 
-def LLVMGPUVectorToGPU :
+def LLVMGPUVectorToGPUPass :
     InterfacePass<"iree-llvmgpu-vector-to-gpu", "mlir::FunctionOpInterface"> {
   let summary = "Pass to convert vector to gpu.";
-  let constructor = "mlir::iree_compiler::createLLVMGPUVectorToGPU()";
 }
 
 //------------------------------------------------------------------------------
 // Test Passes
 //------------------------------------------------------------------------------
 
-def TestLLVMGPUScalarizeMathOp :
+def TestLLVMGPUScalarizeMathOpPass :
     Pass<"iree-test-llvmgpu-legalize-ops", "ModuleOp"> {
   let summary = "Test pass for several legalization patterns.";
-  let constructor = "mlir::iree_compiler::createTestLLVMGPULegalizePass()";
 }
 
 #endif // IREE_CODEGEN_LLVMGPU_PASSES
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
index b16c4ddd9412..e9dfc6a5e9ac 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLLowerExecutableTarget.cpp
@@ -7,8 +7,6 @@
 #include "iree/compiler/Codegen/Common/PassUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
-#include "iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h"
-#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h"
 #include "iree/compiler/Dialect/HAL/IR/HALDialect.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
@@ -24,13 +22,17 @@
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_ROCDLLOWEREXECUTABLETARGETPASS
+#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h.inc"
+
 namespace {
 using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline;
 
 /// Lowers an IREE hal.executable.variant operation using a suitable pass
 /// pipeline.
-class ROCDLLowerExecutableTargetPass
-    : public ROCDLLowerExecutableTargetBase<ROCDLLowerExecutableTargetPass> {
+class ROCDLLowerExecutableTargetPass final
+    : public impl::ROCDLLowerExecutableTargetPassBase<
+          ROCDLLowerExecutableTargetPass> {
 public:
   void getDependentDialects(DialectRegistry &registry) const override {
     registry
@@ -82,10 +84,4 @@ class ROCDLLowerExecutableTargetPass
   }
 };
 } // namespace
-
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createROCDLLowerExecutableTargetPass() {
-  return std::make_unique<ROCDLLowerExecutableTargetPass>();
-}
-
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h
deleted file mode 100644
index 067cfead96ed..000000000000
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#ifndef IREE_COMPILER_CODEGEN_LLVMGPU_ROCDLPASSDETAIL_H_
-#define IREE_COMPILER_CODEGEN_LLVMGPU_ROCDLPASSDETAIL_H_
-
-#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
-#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir::iree_compiler {
-
-#define GEN_PASS_CLASSES
-#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h.inc"
-
-} // namespace mlir::iree_compiler
-
-#endif // IREE_COMPILER_CODEGEN_LLVMGPU_ROCDLPASSDETAIL_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h
index 122eebf71290..696a6c0fab19 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h
@@ -12,23 +12,13 @@
 
 namespace mlir::iree_compiler {
 
-//===----------------------------------------------------------------------===//
-// Passes
-//===----------------------------------------------------------------------===//
-
-/// Creates a pass that calls a dynamic pipeline to progressively lower Linalg
-/// with tensor semantics to ROCDL.
-std::unique_ptr<InterfacePass<FunctionOpInterface>>
-createROCDLLowerExecutableTargetPass();
-
-/// Creates a pass to select the lowering strategy for converting to ROCDL.
-std::unique_ptr<OperationPass<ModuleOp>>
-createROCDLSelectLoweringStrategyPass();
-
 //===----------------------------------------------------------------------===//
 // Pass Registration
 //===----------------------------------------------------------------------===//
 
+#define GEN_PASS_DECL
+#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h.inc" // IWYU pragma: keep
+
 void registerCodegenROCDLPasses();
 
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
index 625ad1ada90b..bf91b6ebd084 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLPasses.td
@@ -13,20 +13,16 @@ include "mlir/Pass/PassBase.td"
 // ROCDL Passes (keep alphabetical)
 //===----------------------------------------------------------------------===//
 
-def ROCDLLowerExecutableTarget : InterfacePass<
+def ROCDLLowerExecutableTargetPass : InterfacePass<
     "iree-rocdl-lower-executable-target", "mlir::FunctionOpInterface"> {
   let summary = "Lower an IREE hal.executable.variant op using a suitable "
                 "pass pipeline";
-  let constructor =
-      "mlir::iree_compiler::createROCDLLowerExecutableTargetPass()";
 }
 
-def ROCDLSelectLoweringStrategy :
+def ROCDLSelectLoweringStrategyPass :
     Pass<"iree-rocdl-select-lowering-strategy", "ModuleOp"> {
   let summary = "Select a suitable lowering strategy for an IREE "
                 "hal.executable.variant op";
-  let constructor =
-      "mlir::iree_compiler::createROCDLSelectLoweringStrategyPass()";
 }
 
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
index 8d291ffc5d58..65c855aa2e34 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLSelectLoweringStrategy.cpp
@@ -7,17 +7,20 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h"
-#include "iree/compiler/Codegen/LLVMGPU/ROCDLPassDetail.h"
 #include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir::iree_compiler {
 
+#define GEN_PASS_DEF_ROCDLSELECTLOWERINGSTRATEGYPASS
+#include "iree/compiler/Codegen/LLVMGPU/ROCDLPasses.h.inc"
+
 namespace {
 /// Selects a strategy for lowering an IREE hal.executable.variant to ROCDL.
-class ROCDLSelectLoweringStrategyPass
-    : public ROCDLSelectLoweringStrategyBase<ROCDLSelectLoweringStrategyPass> {
+class ROCDLSelectLoweringStrategyPass final
+    : public impl::ROCDLSelectLoweringStrategyPassBase<
+          ROCDLSelectLoweringStrategyPass> {
 public:
   void getDependentDialects(DialectRegistry &registry) const override {
     registry
@@ -36,10 +39,4 @@ class ROCDLSelectLoweringStrategyPass
   }
 };
 } // namespace
-
-std::unique_ptr<OperationPass<ModuleOp>>
-createROCDLSelectLoweringStrategyPass() {
-  return std::make_unique<ROCDLSelectLoweringStrategyPass>();
-}
-
 } // namespace mlir::iree_compiler