Commit
[update] README.md
chenghuaWang committed Apr 17, 2024
1 parent 5b8b027 commit 371469b
Showing 3 changed files with 169 additions and 50 deletions.
58 changes: 54 additions & 4 deletions README.md
@@ -300,6 +300,56 @@ func main() {

</details>


#### 2.1.3 Conv2d 3x3

<details>
<summary>[Conv2d 3x3 in nncv's lang (click to expand)]</summary>

```aten
@package = "main";
import "io";
func Conv2d(input Tensor<1, 3, 2048, 2048, float32>,
kernel Tensor<16, 3, 3, 3, float32>,
output Tensor<1, 16, 2046, 2046, float32>) {
pfor (n := 0; 1; 1) {
pfor (k := 0; 16; 1) {
pfor (oh := 0; 2046; 1) {
pfor (ow := 0; 2046; 1) {
pfor (c := 0; 3; 1) {
pfor (r := 0; 3; 1) {
pfor (s := 0; 3; 1) {
output[n, k, oh, ow] = output[n, k, oh, ow] + input[n, c, oh + r, ow + s] * kernel[k, c, r, s];
};
};
};
};
};
};
};
};
func main() {
// NCHW
var input Tensor<1, 3, 2048, 2048, float32>;
// KCRS
var kernel Tensor<16, 3, 3, 3, float32>;
// padding = 0, stride = 1
var output Tensor<1, 16, 2046, 2046, float32>;
// timing.
start := io.clock();
Conv2d(input, kernel, output);
end := io.clock();
io.print(end - start);
io.newLine();
};
```

</details>
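The output shape in the example follows the usual convolution arithmetic: with padding = 0 and stride = 1, a 2048x2048 input convolved with a 3x3 kernel gives 2046x2046. A minimal C++ sketch of that arithmetic (the helper name `convOutSize` is ours, not part of nncv):

```cpp
#include <cstdio>

// out = (in + 2 * pad - kernel) / stride + 1 (the usual convolution arithmetic)
static int convOutSize(int in, int kernel, int pad, int stride) {
  return (in + 2 * pad - kernel) / stride + 1;
}

int main() {
  // Matches the example above: 2048x2048 input, 3x3 kernel, pad = 0, stride = 1.
  std::printf("%d\n", convOutSize(2048, 3, /*pad=*/0, /*stride=*/1));  // prints 2046
  return 0;
}
```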

### 2.2 Parallel For Loops

Aten-lang provides a `pfor` (parallel-for) construct; the compiler lowers every `pfor` scope to an `affine.for` in MLIR. For example:
@@ -309,7 +359,7 @@ Aten-lang provides a `pfor`(parallel-for) mechanism, which will lowering all `pf
import "io";
func matmul(lhs Tensor<6, 6, float32>, rhs Tensor<6, 6, float32>, dst Tensor<6, 6, float32>) {
func matmul(lhs Tensor<512, 512, float32>, rhs Tensor<512, 512, float32>, dst Tensor<512, 512, float32>) {
pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/6; /*step*/ 1) {
pfor (j := 0; 6; 1) {
pfor (k := 0; 6; 1) {
@@ -320,9 +370,9 @@ func matmul(lhs Tensor<6, 6, float32>, rhs Tensor<6, 6, float32>, dst Tensor<6,
};
func main() -> void {
var lhs Tensor<6, 6, float32>;
var rhs Tensor<6, 6, float32>;
var dst Tensor<6, 6, float32>;
var lhs Tensor<512, 512, float32>;
var rhs Tensor<512, 512, float32>;
var dst Tensor<512, 512, float32>;
matmul(lhs, rhs, dst);
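For intuition, the loop nest below is a plain C++ rendering of the same matmul. It is only a sketch of what the `pfor` nest computes, not what the nncv compiler emits, and `N` stands in for the tensor dimension (6 in the original example, 512 after this change).

```cpp
#include <vector>

// Plain C++ rendering of the pfor matmul loop nest above (sequential, for
// illustration only; not compiler output). Buffers are row-major N*N.
constexpr int N = 512;

void matmulRef(const std::vector<float>& lhs, const std::vector<float>& rhs,
               std::vector<float>& dst) {
  // Each C++ loop corresponds to one pfor. The i and j loops carry no
  // cross-iteration dependence and can run in parallel; k accumulates into
  // dst[i][j], so it is a reduction and stays sequential here.
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < N; ++k)
        dst[i * N + j] += lhs[i * N + k] * rhs[k * N + j];
}
```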
145 changes: 109 additions & 36 deletions src/nncv/compiler/Conversion/CodeGen/LlvmGpu/ModernTileGpu.cpp
@@ -73,6 +73,11 @@ struct Simt {
SmallVector<mlir::Operation*> loopsInEachThreads;
};

struct TileOps {
SmallVector<SmallVector<int64_t>> tileSizes;
SmallVector<bool> canForall;
};

struct ModernMatMulTileOptions {
SmallVector<SmallVector<int64_t>> tileSizes;
SmallVector<bool> canForall;
@@ -89,6 +94,23 @@ struct ModernConv2dInterfaceTileOptions {
};

// FIXME: Change to fit GPU SIMT loops.
TileOps Solver(mlir::Operation* op) {
mlir::linalg::LinalgOp linalgOp = mlir::cast<mlir::linalg::LinalgOp>(op);
TileOps ret;
// if generic op
if (mlir::isa<mlir::linalg::GenericOp>(linalgOp)) {
auto genericOp = mlir::cast<mlir::linalg::GenericOp>(op);
// TODO
}

// if matmul op
if (mlir::isa<mlir::linalg::MatmulOp>(linalgOp)) {
// TODO
}

printf("[ Erro ] Op: %s is not support yet\n", linalgOp->getName().getStringRef().str().c_str());
return ret;
}
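`Solver` is still a stub here (both branches are `TODO`). The sketch below is our assumption of how it could be completed by reusing the existing option-getters; `SolverSketch` is a hypothetical name, it presumes a forward declaration of `getGenericTileOptions`, and it is not the committed design.

```cpp
// Hypothetical completion of Solver (our assumption, not the committed code):
// dispatch on the linalg op kind and fill TileOps from the existing helpers.
// Assumes a forward declaration of getGenericTileOptions above this point.
TileOps SolverSketch(mlir::Operation* op) {
  TileOps ret;
  if (mlir::isa<mlir::linalg::GenericOp>(op)) {
    auto opts = getGenericTileOptions(op);
    ret.tileSizes = opts.tileSizes;
    ret.canForall = opts.canForall;
    return ret;
  }
  if (mlir::isa<mlir::linalg::MatmulOp>(op)) {
    // Placeholder numbers in the spirit of the matmul tiling used elsewhere.
    ret.tileSizes.push_back({8, 32, 0});
    ret.tileSizes.push_back({4, 4, 0});
    ret.tileSizes.push_back({0, 0, 4});
    ret.canForall = {true, true, false};
    return ret;
  }
  printf("[ Error ] Op: %s is not supported yet\n",
         op->getName().getStringRef().str().c_str());
  return ret;
}
```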

ModernConv2dInterfaceTileOptions getConv2dInterfaceTileOptions(mlir::Operation* op) {
// A method enum all sizes for selecting a output batch size.
@@ -254,15 +276,31 @@ ModernConv2dInterfaceTileOptions getConv2dInterfaceTileOptions(mlir::Operation*
return res;
}

bool isGenericStyleMatMul(mlir::Operation* op) {
bool isOneReductionAtLast(mlir::Operation* op) {
auto linalgOp = mlir::cast<mlir::linalg::GenericOp>(op);
auto iterTypes = linalgOp.getIteratorTypesArray();
if (iterTypes.size() != 3) return false;
for (size_t i = 0; i < iterTypes.size() - 1; ++i) {
if (iterTypes[i] != utils::IteratorType::parallel) { return false; }
if (iterTypes[i] == utils::IteratorType::reduction) return false;
}
if (iterTypes[iterTypes.size() - 1] != utils::IteratorType::reduction) return false;
return true;
if (iterTypes[iterTypes.size() - 1] == utils::IteratorType::reduction) return true;
return false;
}

std::pair<bool, int> isAllParallelBeforeReduction(mlir::Operation* op) {
bool meetR = false;
int pos = 0;
auto linalgOp = mlir::cast<mlir::linalg::GenericOp>(op);
auto iterTypes = linalgOp.getIteratorTypesArray();
int cnt = 0;
for (auto& item : iterTypes) {
if (item == utils::IteratorType::reduction && !meetR) {
meetR = true;
pos = cnt;
}
if (meetR && item == utils::IteratorType::parallel) return std::make_pair(false, pos);
cnt++;
}
return std::make_pair(true, pos);
}
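To make the two checks above concrete, the standalone mock below (not the nncv code, just the same logic over a plain vector) shows what they return for typical iterator patterns.

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Standalone mock of the two iterator-type checks (P = parallel, R = reduction).
enum class It { P, R };

bool isOneReductionAtLastMock(const std::vector<It>& t) {
  if (t.size() != 3) return false;             // same rank restriction as the original
  for (size_t i = 0; i + 1 < t.size(); ++i)
    if (t[i] == It::R) return false;           // reductions allowed only at the end
  return t.back() == It::R;
}

std::pair<bool, int> isAllParallelBeforeReductionMock(const std::vector<It>& t) {
  bool meetR = false;
  int pos = 0, cnt = 0;
  for (It item : t) {
    if (item == It::R && !meetR) { meetR = true; pos = cnt; }
    if (meetR && item == It::P) return {false, pos};  // parallel after a reduction
    ++cnt;
  }
  return {true, pos};
}

int main() {
  // {P, P, R}: single trailing reduction -> 1 and (1, 2)
  // {P, R, P}: reduction inside parallel -> 0 and (0, 1)
  std::printf("%d\n", isOneReductionAtLastMock({It::P, It::P, It::R}));
  auto [ok, pos] = isAllParallelBeforeReductionMock({It::P, It::R, It::P});
  std::printf("%d %d\n", ok, pos);
  return 0;
}
```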

bool isGenericAllParallel(mlir::Operation* op) {
@@ -276,45 +314,80 @@ bool isGenericAllParallel(mlir::Operation* op) {

ModernGenericTileOptions getGenericTileOptions(mlir::Operation* op) {
ModernGenericTileOptions ret;
// check if is MatMul
if (isGenericStyleMatMul(op)) {
ret.tileSizes.push_back({8, 32, 0});
ret.tileSizes.push_back({4, 4, 0});
ret.tileSizes.push_back({0, 0, 4});
ret.canForall = {true, true, false};
return ret;
}

// other types generic
// mark forall to true if parallel.
auto linalgOp = mlir::cast<mlir::linalg::GenericOp>(op);
auto iterTypes = linalgOp.getIteratorTypesArray();
for (auto item : iterTypes) {
if (item == utils::IteratorType::parallel)
ret.canForall.push_back(true);
else
ret.canForall.push_back(false);
}

// if all parallel, check the parallel dims
// case 1. all parallel
if (isGenericAllParallel(op)) {
// iterTypes.size() == 4 is the normal generic op with batch, channel, h, w. Remember that
// an earlier pass eliminates batch=1 tensors, so size() == 4 actually means batch != 1.
// However, batch-level parallelism is not applied here: this kind of fully parallel op is
// element-wise, so iterating over the whole batch is fine and causes no performance drop.
//
// In brief, only the innermost parallel loop is tiled, because every element is visited
// exactly once; the vector size is set to 8 for AVX2.
if (iterTypes.size() == 4) { ret.tileSizes.push_back({1, 1, 1, 8}); }
if (iterTypes.size() == 4) {
// block level
ret.tileSizes.push_back({4, 4, 0, 0});

// thread level
ret.tileSizes.push_back({0, 0, 4, 4});
}
// same as size() for 3, 2, 1
if (iterTypes.size() == 3) { ret.tileSizes.push_back({1, 1, 8}); }
if (iterTypes.size() == 2) { ret.tileSizes.push_back({1, 8}); }
if (iterTypes.size() == 1) { ret.tileSizes.push_back({8}); }
if (iterTypes.size() == 3) {
// block level
ret.tileSizes.push_back({4, 0, 0});

// thread level
ret.tileSizes.push_back({0, 0, 4});
}
if (iterTypes.size() == 2) {
// block level
ret.tileSizes.push_back({4, 4});

// thread level
ret.tileSizes.push_back({4, 4});
}
if (iterTypes.size() == 1) {
// block level
ret.tileSizes.push_back({4});

// thread level
ret.tileSizes.push_back({4});
}

// all loops can be made parallel
ret.canForall = SmallVector<bool>(2, true);

return ret;
}

// case 2. Has reduction and reduction is all behind parallel
auto _isReductionAtEnd = isOneReductionAtLast(op);
auto [_isAllParallelBeforeReduction, _pos] = isAllParallelBeforeReduction(op);
if (!_isAllParallelBeforeReduction) {
// none of the loops can be parallelized
ret.canForall = SmallVector<bool>(iterTypes.size(), false);
return ret;
}

// reduction is tiled with length = 8 too!
if (_isReductionAtEnd) {
if (iterTypes.size() == 1) {
ret.tileSizes.push_back({8});
ret.canForall = {false};
} else if (iterTypes.size() == 2) {
ret.tileSizes.push_back({8, 8});
ret.canForall = {true, false};
} else if (iterTypes.size() == 3) {
ret.tileSizes.push_back({8, 8, 8});
ret.canForall = {true, true, false};
} else if (iterTypes.size() == 4) {
ret.tileSizes.push_back({8, 8, 8, 8});
ret.canForall = {true, true, true, false};
}

return ret;
}

// A reduction between parallel loops, e.g. ['parallel', 'reduction', 'parallel'], is not supposed
// to happen: before tiling, a pass interchanges the reduction loop to the innermost position.
if (_isAllParallelBeforeReduction) {
printf("[ Erro ] The Multi Reduction op is not supportted yet\n");
std::exit(-1);
}

return ret;
}
16 changes: 6 additions & 10 deletions src/nncv/compiler/Pipeline/DnnModelLowering.cpp
@@ -288,8 +288,6 @@ void DnnModelLowering::run() {
runPmWithExit(pm, m_Module, "Pass Pipeline-3: Perform gpu based tiling and map");
}

goto nv_pipeline_exit;

//===----------------------------------------------------------------------===//
// 5. Prepare vec For GPU !!!
//===----------------------------------------------------------------------===//
@@ -353,8 +351,6 @@ void DnnModelLowering::run() {
"xxxxxxxxxxxxxxxxxxxx: Lowering all vector dialect to gpu or tensor core directly");
}

goto nv_pipeline_exit;

//===----------------------------------------------------------------------===//
// 8. Map to Blocks and Threads using builtin pass
// Map parallel to gpu's dimension Greedily
@@ -363,12 +359,12 @@ void DnnModelLowering::run() {
pm.clear();
pm.addNestedPass<mlir::func::FuncOp>(mlir::nncv::createLoweringScfForAllToParallelPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createGpuMapParallelLoopsPass());
pm.addPass(mlir::createParallelLoopToGpuPass());
pm.addPass(mlir::createGpuKernelOutliningPass());
// register host memory to device side.
pm.addNestedPass<mlir::func::FuncOp>(mlir::nncv::createRegisterMemToGpuPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
pm.addNestedPass<mlir::func::FuncOp>(mlir::createCSEPass());
// pm.addPass(mlir::createParallelLoopToGpuPass());
// pm.addPass(mlir::createGpuKernelOutliningPass());
// // register host memory to device side.
// pm.addNestedPass<mlir::func::FuncOp>(mlir::nncv::createRegisterMemToGpuPass());
// pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
// pm.addNestedPass<mlir::func::FuncOp>(mlir::createCSEPass());
runPmWithExit(pm, m_Module, "Pass Pipeline-7: Forall to Parallel and do Mapping to device");
}
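`runPmWithExit` is called throughout this pipeline but its body is not part of this diff. A minimal sketch of such a helper, assuming it simply runs the pass manager and aborts with the given message on failure (`runPmWithExitSketch` is a hypothetical stand-in, not the actual implementation):

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"

// Hypothetical sketch of a runPmWithExit-style helper (an assumption, not the
// actual nncv implementation): run the pass manager over the module and exit
// with the given message if any pass fails.
static void runPmWithExitSketch(mlir::PassManager& pm, mlir::ModuleOp module,
                                const std::string& msg) {
  if (mlir::failed(pm.run(module))) {
    printf("[ Error ] %s\n", msg.c_str());
    std::exit(-1);
  }
}
```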
