Skip to content

Releases: chenghuaWang/nncv

Testing on more Models and Aten bugs fixed.

17 Apr 07:13
Choose a tag to compare


  • MobileNetV3. Using -target [Native/AdvX86]
  • SqueezeNet. Using -target [Native/AdvX86]
  • Transformer block written in aten-lang. Code
  • Conv2d 3x3 written in aten-lang. Code

New features

  • math package support in aten-lang. See Code
  • Convert arith ops to affine expressions when processing an affine.for's induction variable.
  • New target:
    • Native: uses only MLIR's built-in passes
    • AdvX86: uses scf.for loops with dispatch info and splits for-loops to avoid vector.mask

Full Changelog: v1.0...v1.1

DL Model compile Support for x86 target

27 Feb 14:14
Choose a tag to compare

The nncv compiler now supports DL model compilation for the x86 target.

Currently, nncv supports a very simple lowering pipeline. It basically applies tiling and vectorization to linalg ops. This vectorization method currently only supports CPUs with the AVX2 feature.

If you want to compile a DL model for the CPU target (without parallelism), you can use the commands below to generate an object file:

nncv-c -warp-c-interface -target HostWoParallel res18.mlir -o optimizedRes18.mlir
mlir-translate -mlir-to-llvmir optimizedRes18.mlir -o res18.ll
llc -filetype=object res18.ll -o libres18.o

If you want to enable multi-threading on the CPU target, use the HostWParallel option instead:

nncv-c -warp-c-interface -target HostWParallel res18.mlir -o optimizedRes18.mlir

nncv's aten now supports the polyhedral model

09 Feb 13:09
Choose a tag to compare

What's new

  • Polyhedral model support for aten ir.

For now, nncv's aten ir can be transformed using the polyhedral model provided by polymer. A simple example is given below.

The nncv compiler performs a three-stage lowering (aten-lang --> aten dialect --> mlir's dialects --> llvm ir) and uses LLVM's JIT to execute the result. Aten-lang provides a pfor (parallel-for) mechanism; all pfor scopes are lowered to affine.for in mlir. For example:

@package = "main";

import "io";

func matmul(lhs Tensor<6, 6, float32>, rhs Tensor<6, 6, float32>, dst Tensor<6, 6, float32>) {
    pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/6; /*step*/ 1) {
        pfor (j := 0; 6; 1) {
            pfor (k := 0; 6; 1) {
                dst[i, j] = dst[i, j] + lhs[i, k] * rhs[k, j]; // do 

func main() -> void {
    var lhs Tensor<6, 6, float32>;
    var rhs Tensor<6, 6, float32>;
    var dst Tensor<6, 6, float32>;

    matmul(lhs, rhs, dst);


After lowering to atenir-mlir, we get:

module @__main {
  Aten.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = Aten.binop(mul, %1, %2) : f32
          %4 = Aten.binop(add, %0, %3) : f32
 %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
  Aten.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32> @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()

Then, after lowering all aten-ir to mlir, we get:

module @__main {
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %0, %3 : f32
 %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()

nncv will try to use polymer to optimize all affine loops (memref load and store ops will be raised to affine ops if necessary). After optimization, we get:

#map = affine_map<(d0) -> (d0 * 32)>
#map1 = affine_map<(d0) -> (d0 * 32 + 32)>
module @__main {
  func.func private @S0(%arg0: memref<512x512xf32>, %arg1: index, %arg2: index, %arg3: memref<512x512xf32>, %arg4: index, %arg5: memref<512x512xf32>) attributes {scop.stmt} {
    %0 = affine.load %arg0[symbol(%arg1), symbol(%arg2)] : memref<512x512xf32>
    %1 = affine.load %arg5[symbol(%arg1), symbol(%arg4)] : memref<512x512xf32>
    %2 = affine.load %arg3[symbol(%arg4), symbol(%arg2)] : memref<512x512xf32>
    %3 = arith.mulf %1, %2 : f32
    %4 = arith.addf %0, %3 : f32 %4, %arg0[symbol(%arg1), symbol(%arg2)] : memref<512x512xf32>
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 16 {
      affine.for %arg4 = 0 to 16 {
        affine.for %arg5 = 0 to 16 {
          affine.for %arg6 = #map(%arg3) to #map1(%arg3) {
            affine.for %arg7 = #map(%arg5) to #map1(%arg5) {
              affine.for %arg8 = #map(%arg4) to #map1(%arg4) {
       @S0(%arg2, %arg6, %arg8, %arg1, %arg7, %arg0) : (memref<512x512xf32>, index, index, memref<512x512xf32>, index, memref<512x512xf32>) -> ()
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()
    memref.dealloc %alloc_1 : memref<512x512xf32>
    memref.dealloc %alloc_0 : memref<512x512xf32>
    memref.dealloc %alloc : memref<512x512xf32>

Finally, nncv's lowering pipeline lowers mlir to llvm ir. More examples can be found in the test directory.

nncv lang's frontend is almost done.

24 Jan 06:18
Choose a tag to compare
  1. Add more test cases.
  2. Aten-lang lowering to mlir, almost done.

Case 1: Parallel For Loops


 * author: (
 * brief: test aten-lang frontend code generation
@package = "main";

func matmul(lhs Tensor<512, 512, float32>, rhs Tensor<512, 512, float32>, dst Tensor<512, 512, float32>) {
    pfor(/*lower bound, set axis name*/i := 0; /*upper bound*/512; /*step*/ 1) {
        pfor (j := 0; 512; 1) {
            pfor (k := 0; 512; 1) {
                dst[i, j] = dst[i, j] + lhs[i, k] * rhs[k, j]; // do 

func main() -> void {
    var lhs Tensor<512, 512, float32>;
    var rhs Tensor<512, 512, float32>;
    var dst Tensor<512, 512, float32>;

    matmul(lhs, rhs, dst);

Lowering to Pfor.air:

module @__main {
  Aten.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = Aten.binop(mul, %1, %2) : f32
          %4 = Aten.binop(add, %0, %3) : f32
 %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
  Aten.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32> @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()

Lowering to mlir:

module @__main {
  func.func private @matmul(%arg0: memref<512x512xf32>, %arg1: memref<512x512xf32>, %arg2: memref<512x512xf32>) {
    affine.for %arg3 = 0 to 512 {
      affine.for %arg4 = 0 to 512 {
        affine.for %arg5 = 0 to 512 {
          %0 = memref.load %arg2[%arg3, %arg4] : memref<512x512xf32>
          %1 = memref.load %arg0[%arg3, %arg5] : memref<512x512xf32>
          %2 = memref.load %arg1[%arg5, %arg4] : memref<512x512xf32>
          %3 = arith.mulf %1, %2 : f32
          %4 = arith.addf %0, %3 : f32
 %4, %arg2[%arg3, %arg4] : memref<512x512xf32>
  func.func private @main() {
    %alloc = memref.alloc() : memref<512x512xf32>
    %alloc_0 = memref.alloc() : memref<512x512xf32>
    %alloc_1 = memref.alloc() : memref<512x512xf32>
    call @matmul(%alloc, %alloc_0, %alloc_1) : (memref<512x512xf32>, memref<512x512xf32>, memref<512x512xf32>) -> ()

Case 2: External Function Call


@package = "main";

func _lib_nncv_do_something(Tensor<1, 1, float32>);

pub func add(a int32, b int32) -> int32 {
    return a + b;

func main() {
    res := add(8, 8);
    var t Tensor<1, 1, float32>;

Lowering to FuncCall.air:

module @__main {
  Aten.func private @_lib_nncv_do_something(memref<1x1xf32>)
  Aten.func public @add(%arg0: !<s, 32>, %arg1: !<s, 32>) -> !<s, 32> {
    %0 = Aten.binop(add, %arg0, %arg1) : !<s, 32>
    Aten.return %0 : !<s, 32>
  Aten.func private @main() {
    %0 = Aten.const(<8> : !<s, 32>) : !<s, 32>
    %1 = @add(%0, %0) : (!<s, 32>, !<s, 32>) -> !<s, 32>
    %2 = Aten.alloca !<s, 32>, aten.ptr <!<s, 32>>, ["res"] {alignment = 4 : i64} %1, %2 : !<s, 32>, aten.ptr <!<s, 32>>
    %alloc = memref.alloc() : memref<1x1xf32> @_lib_nncv_do_something(%alloc) : (memref<1x1xf32>) -> ()

Lowering to mlir:

module @__main {
  func.func private @_lib_nncv_do_something(memref<1x1xf32>)
  func.func @add(%arg0: i32, %arg1: i32) -> i32 {
    %0 = arith.addi %arg0, %arg1 : i32
    return %0 : i32
  func.func private @main() {
    %c8_i32 = arith.constant 8 : i32
    %0 = call @add(%c8_i32, %c8_i32) : (i32, i32) -> i32
    %alloca = memref.alloca() {alignment = 4 : i64} : memref<i32> %0, %alloca[] : memref<i32>
    %alloc = memref.alloc() : memref<1x1xf32>
    call @_lib_nncv_do_something(%alloc) : (memref<1x1xf32>) -> ()

Check ./test for more examples.

v1.0-beta.2 release.

06 Jan 14:12
Choose a tag to compare
v1.0-beta.2 release. Pre-release

freeze src2. For backup.

Add new GPU Lowering Pipeline.
Add NNCV Frontend and Graph IR/Transforms.
Add Graph Level Optimization.

v1.0-beta.1 release.

29 Dec 12:23
Choose a tag to compare

freeze src1. For backup.