Commit
[update] README.md
chenghuaWang committed Apr 17, 2024
1 parent 5b8b027 commit 371469b
Showing 3 changed files with 169 additions and 50 deletions.
58 changes: 54 additions & 4 deletions README.md
@@ -300,6 +300,56 @@ func main() {

</details>


#### 2.1.3 Conv2d 3x3

<details>
<summary>[Conv2d 3x3 in nncv's lang (click to expand)]</summary>

```aten
@package = "main";
import "io";
func Conv2d(input Tensor<1, 3, 2048, 2048, float32>,
kernel Tensor<16, 3, 3, 3, float32>,
output Tensor<1, 16, 2046, 2046, float32>) {
pfor (n := 0; 1; 1) {
pfor (k := 0; 16; 1) {
pfor (oh := 0; 2046; 1) {
pfor (ow := 0; 2046; 1) {
pfor (c := 0; 3; 1) {
pfor (r := 0; 3; 1) {
pfor (s := 0; 3; 1) {
output[n, k, oh, ow] = output[n, k, oh, ow] + input[n, c, oh + r, ow + s] * kernel[k, c, r, s];
};
};
};
};
};
};
};
};
func main() {
// NCHW
var input Tensor<1, 3, 2048, 2048, float32>;
// KCRS
var kernel Tensor<16, 3, 3, 3, float32>;
// padding = 0, stride = 1
var output Tensor<1, 16, 2046, 2046, float32>;
// timing.
start := io.clock();
Conv2d(input, kernel, output);
end := io.clock();
io.print(end - start);
io.newLine();
};
```

</details>
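The output shape in the example follows the usual convolution arithmetic: with padding = 0 and stride = 1, a 2048x2048 input convolved with a 3x3 kernel gives 2046x2046. A minimal C++ sketch of that arithmetic (the helper name `convOutSize` is ours, not part of nncv):

```cpp
#include <cstdio>

// out = (in + 2 * pad - kernel) / stride + 1 (the usual convolution arithmetic)
static int convOutSize(int in, int kernel, int pad, int stride) {
  return (in + 2 * pad - kernel) / stride + 1;
}

int main() {
  // Matches the example above: 2048x2048 input, 3x3 kernel, pad = 0, stride = 1.
  std::printf("%d\n", convOutSize(2048, 3, /*pad=*/0, /*stride=*/1));  // prints 2046
  return 0;
}
```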

### 2.2 Parallel For Loops

Aten-lang provides a `pfor` (parallel-for) construct; the compiler lowers every `pfor` scope to an `affine.for` in MLIR. For example:
@@ -309,7 +359,7 @@ Aten-lang provides a `pfor`(parallel-for) mechanism, which will lowering all `pf
import "io";
func matmul(lhs Tensor<6, 6, float32>, rhs Tensor<6, 6, float32>, dst Tensor<6, 6, float32>) {
func matmul(lhs Tensor<512, 512, float32>, rhs Tensor<512, 512, float32>, dst Tensor<512, 512, float32>) {
pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/6; /*step*/ 1) {
pfor (j := 0; 6; 1) {
pfor (k := 0; 6; 1) {
@@ -320,9 +370,9 @@ func matmul(lhs Tensor<6, 6, float32>, rhs Tensor<6, 6, float32>, dst Tensor<6,
};
func main() -> void {
var lhs Tensor<6, 6, float32>;
var rhs Tensor<6, 6, float32>;
var dst Tensor<6, 6, float32>;
var lhs Tensor<512, 512, float32>;
var rhs Tensor<512, 512, float32>;
var dst Tensor<512, 512, float32>;
matmul(lhs, rhs, dst);
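For intuition, the loop nest below is a plain C++ rendering of the same matmul. It is only a sketch of what the `pfor` nest computes, not what the nncv compiler emits, and `N` stands in for the tensor dimension (6 in the original example, 512 after this change).

```cpp
#include <vector>

// Plain C++ rendering of the pfor matmul loop nest above (sequential, for
// illustration only; not compiler output). Buffers are row-major N*N.
constexpr int N = 512;

void matmulRef(const std::vector<float>& lhs, const std::vector<float>& rhs,
               std::vector<float>& dst) {
  // Each C++ loop corresponds to one pfor. The i and j loops carry no
  // cross-iteration dependence and can run in parallel; k accumulates into
  // dst[i][j], so it is a reduction and stays sequential here.
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < N; ++k)
        dst[i * N + j] += lhs[i * N + k] * rhs[k * N + j];
}
```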
145 changes: 109 additions & 36 deletions src/nncv/compiler/Conversion/CodeGen/LlvmGpu/ModernTileGpu.cpp
@@ -73,6 +73,11 @@ struct Simt {
SmallVector<mlir::Operation*> loopsInEachThreads;
};

struct TileOps {
SmallVector<SmallVector<int64_t>> tileSizes;
SmallVector<bool> canForall;
};

struct ModernMatMulTileOptions {
SmallVector<SmallVector<int64_t>> tileSizes;
SmallVector<bool> canForall;
@@ -89,6 +94,23 @@ struct ModernConv2dInterfaceTileOptions {
};

// FIXME: Change to fit GPU SIMT loops.
TileOps Solver(mlir::Operation* op) {
mlir::linalg::LinalgOp linalgOp = mlir::cast<mlir::linalg::LinalgOp>(op);
TileOps ret;
// if generic op
if (mlir::isa<mlir::linalg::GenericOp>(linalgOp)) {
auto genericOp = mlir::cast<mlir::linalg::GenericOp>(op);
// TODO
}

// if matmul op
if (mlir::isa<mlir::linalg::MatmulOp>(linalgOp)) {
// TODO
}

printf("[ Erro ] Op: %s is not support yet\n", linalgOp->getName().getStringRef().str().c_str());
return ret;
}
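`Solver` is still a stub here (both branches are `TODO`). The sketch below is our assumption of how it could be completed by reusing the existing option-getters; `SolverSketch` is a hypothetical name, it presumes a forward declaration of `getGenericTileOptions`, and it is not the committed design.

```cpp
// Hypothetical completion of Solver (our assumption, not the committed code):
// dispatch on the linalg op kind and fill TileOps from the existing helpers.
// Assumes a forward declaration of getGenericTileOptions above this point.
TileOps SolverSketch(mlir::Operation* op) {
  TileOps ret;
  if (mlir::isa<mlir::linalg::GenericOp>(op)) {
    auto opts = getGenericTileOptions(op);
    ret.tileSizes = opts.tileSizes;
    ret.canForall = opts.canForall;
    return ret;
  }
  if (mlir::isa<mlir::linalg::MatmulOp>(op)) {
    // Placeholder numbers in the spirit of the matmul tiling used elsewhere.
    ret.tileSizes.push_back({8, 32, 0});
    ret.tileSizes.push_back({4, 4, 0});
    ret.tileSizes.push_back({0, 0, 4});
    ret.canForall = {true, true, false};
    return ret;
  }
  printf("[ Error ] Op: %s is not supported yet\n",
         op->getName().getStringRef().str().c_str());
  return ret;
}
```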

ModernConv2dInterfaceTileOptions getConv2dInterfaceTileOptions(mlir::Operation* op) {
// A method enum all sizes for selecting a output batch size.
@@ -254,15 +276,31 @@ ModernConv2dInterfaceTileOptions getConv2dInterfaceTileOptions(mlir::Operation*
return res;
}

bool isGenericStyleMatMul(mlir::Operation* op) {
bool isOneReductionAtLast(mlir::Operation* op) {
auto linalgOp = mlir::cast<mlir::linalg::GenericOp>(op);
auto iterTypes = linalgOp.getIteratorTypesArray();
if (iterTypes.size() != 3) return false;
for (size_t i = 0; i < iterTypes.size() - 1; ++i) {
if (iterTypes[i] != utils::IteratorType::parallel) { return false; }
if (iterTypes[i] == utils::IteratorType::reduction) return false;
}
if (iterTypes[iterTypes.size() - 1] != utils::IteratorType::reduction) return false;
return true;
if (iterTypes[iterTypes.size() - 1] == utils::IteratorType::reduction) return true;
return false;
}

std::pair<bool, int> isAllParallelBeforeReduction(mlir::Operation* op) {
bool meetR = false;
int pos = 0;
auto linalgOp = mlir::cast<mlir::linalg::GenericOp>(op);
auto iterTypes = linalgOp.getIteratorTypesArray();
int cnt = 0;
for (auto& item : iterTypes) {
if (item == utils::IteratorType::reduction && !meetR) {
meetR = true;
pos = cnt;
}
if (meetR && item == utils::IteratorType::parallel) return std::make_pair(false, pos);
cnt++;
}
return std::make_pair(true, pos);
}
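To make the two checks above concrete, the standalone mock below (not the nncv code, just the same logic over a plain vector) shows what they return for typical iterator patterns.

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Standalone mock of the two iterator-type checks (P = parallel, R = reduction).
enum class It { P, R };

bool isOneReductionAtLastMock(const std::vector<It>& t) {
  if (t.size() != 3) return false;             // same rank restriction as the original
  for (size_t i = 0; i + 1 < t.size(); ++i)
    if (t[i] == It::R) return false;           // reductions allowed only at the end
  return t.back() == It::R;
}

std::pair<bool, int> isAllParallelBeforeReductionMock(const std::vector<It>& t) {
  bool meetR = false;
  int pos = 0, cnt = 0;
  for (It item : t) {
    if (item == It::R && !meetR) { meetR = true; pos = cnt; }
    if (meetR && item == It::P) return {false, pos};  // parallel after a reduction
    ++cnt;
  }
  return {true, pos};
}

int main() {
  // {P, P, R}: single trailing reduction -> 1 and (1, 2)
  // {P, R, P}: reduction inside parallel -> 0 and (0, 1)
  std::printf("%d\n", isOneReductionAtLastMock({It::P, It::P, It::R}));
  auto [ok, pos] = isAllParallelBeforeReductionMock({It::P, It::R, It::P});
  std::printf("%d %d\n", ok, pos);
  return 0;
}
```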

bool isGenericAllParallel(mlir::Operation* op) {
@@ -276,45 +314,80 @@ bool isGenericAllParallel(mlir::Operation* op) {

ModernGenericTileOptions getGenericTileOptions(mlir::Operation* op) {
ModernGenericTileOptions ret;
// check if is MatMul
if (isGenericStyleMatMul(op)) {
ret.tileSizes.push_back({8, 32, 0});
ret.tileSizes.push_back({4, 4, 0});
ret.tileSizes.push_back({0, 0, 4});
ret.canForall = {true, true, false};
return ret;
}

// other types generic
// mark forall to true if parallel.
auto linalgOp = mlir::cast<mlir::linalg::GenericOp>(op);
auto iterTypes = linalgOp.getIteratorTypesArray();
for (auto item : iterTypes) {
if (item == utils::IteratorType::parallel)
ret.canForall.push_back(true);
else
ret.canForall.push_back(false);
}

// if all parallel, check the parallel dims
// case 1. all parallel
if (isGenericAllParallel(op)) {
// iterTypes.size() == 4 is the normal generic op with batch, channel, h, w. Remember that
// an earlier pass eliminates batch=1 tensors, so size() == 4 actually means batch != 1.
// However, batch-level parallelism is not applied here: this kind of fully parallel op is
// element-wise, so iterating over the whole batch is fine and causes no performance drop.
//
// In brief, only the innermost parallel loop is tiled, because every element is visited
// exactly once; the vector size is set to 8 for AVX2.
if (iterTypes.size() == 4) { ret.tileSizes.push_back({1, 1, 1, 8}); }
if (iterTypes.size() == 4) {
// block level
ret.tileSizes.push_back({4, 4, 0, 0});

// thread level
ret.tileSizes.push_back({0, 0, 4, 4});
}
// same as size() for 3, 2, 1
if (iterTypes.size() == 3) { ret.tileSizes.push_back({1, 1, 8}); }
if (iterTypes.size() == 2) { ret.tileSizes.push_back({1, 8}); }
if (iterTypes.size() == 1) { ret.tileSizes.push_back({8}); }
if (iterTypes.size() == 3) {
// block level
ret.tileSizes.push_back({4, 0, 0});

// thread level
ret.tileSizes.push_back({0, 0, 4});
}
if (iterTypes.size() == 2) {
// block level
ret.tileSizes.push_back({4, 4});

// thread level
ret.tileSizes.push_back({4, 4});
}
if (iterTypes.size() == 1) {
// block level
ret.tileSizes.push_back({4});

// thread level
ret.tileSizes.push_back({4});
}

// all loops can be made parallel
ret.canForall = SmallVector<bool>(2, true);

return ret;
}

// case 2. Has reduction and reduction is all behind parallel
auto _isReductionAtEnd = isOneReductionAtLast(op);
auto [_isAllParallelBeforeReduction, _pos] = isAllParallelBeforeReduction(op);
if (!_isAllParallelBeforeReduction) {
// none of the loops can be parallelized
ret.canForall = SmallVector<bool>(iterTypes.size(), false);
return ret;
}

// reduction is tiled with length = 8 too!
if (_isReductionAtEnd) {
if (iterTypes.size() == 1) {
ret.tileSizes.push_back({8});
ret.canForall = {false};
} else if (iterTypes.size() == 2) {
ret.tileSizes.push_back({8, 8});
ret.canForall = {true, false};
} else if (iterTypes.size() == 3) {
ret.tileSizes.push_back({8, 8, 8});
ret.canForall = {true, true, false};
} else if (iterTypes.size() == 4) {
ret.tileSizes.push_back({8, 8, 8, 8});
ret.canForall = {true, true, true, false};
}

return ret;
}

// A reduction between parallel loops, e.g. ['parallel', 'reduction', 'parallel'], is not supposed
// to happen: before tiling, a pass interchanges the reduction loop to the innermost position.
if (_isAllParallelBeforeReduction) {
printf("[ Erro ] The Multi Reduction op is not supportted yet\n");
std::exit(-1);
}

return ret;
}
16 changes: 6 additions & 10 deletions src/nncv/compiler/Pipeline/DnnModelLowering.cpp
@@ -288,8 +288,6 @@ void DnnModelLowering::run() {
runPmWithExit(pm, m_Module, "Pass Pipeline-3: Perform gpu based tiling and map");
}

goto nv_pipeline_exit;

//===----------------------------------------------------------------------===//
// 5. Prepare vec For GPU !!!
//===----------------------------------------------------------------------===//
@@ -353,8 +351,6 @@ void DnnModelLowering::run() {
"xxxxxxxxxxxxxxxxxxxx: Lowering all vector dialect to gpu or tensor core directly");
}

goto nv_pipeline_exit;

//===----------------------------------------------------------------------===//
// 8. Map to Blocks and Threads using builtin pass
// Map parallel to gpu's dimension Greedily
@@ -363,12 +359,12 @@ void DnnModelLowering::run() {
pm.clear();
pm.addNestedPass<mlir::func::FuncOp>(mlir::nncv::createLoweringScfForAllToParallelPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createGpuMapParallelLoopsPass());
pm.addPass(mlir::createParallelLoopToGpuPass());
pm.addPass(mlir::createGpuKernelOutliningPass());
// register host memory to device side.
pm.addNestedPass<mlir::func::FuncOp>(mlir::nncv::createRegisterMemToGpuPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createCSEPass());
// pm.addPass(mlir::createParallelLoopToGpuPass());
// pm.addPass(mlir::createGpuKernelOutliningPass());
// // register host memory to device side.
// pm.addNestedPass<mlir::func::FuncOp>(mlir::nncv::createRegisterMemToGpuPass());
// pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
// pm.addNestedPass<mlir::func::FuncOp>(mlir::createCSEPass());
runPmWithExit(pm, m_Module, "Pass Pipeline-7: Forall to Parallel and do Mapping to device");
}
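`runPmWithExit` is called throughout this pipeline but its body is not part of this diff. A minimal sketch of such a helper, assuming it simply runs the pass manager and aborts with the given message on failure (`runPmWithExitSketch` is a hypothetical stand-in, not the actual implementation):

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

// Hypothetical sketch of a runPmWithExit-style helper (an assumption, not the
// actual nncv implementation): run the pass manager over the module and exit
// with the given message if any pass fails.
static void runPmWithExitSketch(mlir::PassManager& pm, mlir::ModuleOp module,
                                const std::string& msg) {
  if (mlir::failed(pm.run(module))) {
    printf("[ Error ] %s\n", msg.c_str());
    std::exit(-1);
  }
}
```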
