diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 7d84eeb..050a6f4 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -29,27 +29,27 @@ jobs:
           - name: "Ubuntu OpenMPI g++"
             CC_COMPILER: gcc
             CXX_COMPILER: g++
-            container: "ubuntu_openmpi"
+            container: "ubuntu_gcc_openmpi"
             MPIEXEC_PREFLAGS: "--allow-run-as-root --oversubscribe"
             USE_SANITIZER: ""
             CMAKE_BUILD_TYPE: Release
-            DOCKER_OPTIONS: " "
+            DOCKER_OPTIONS: "--cap-add SYS_PTRACE"
             CODE_COVERAGE: "OFF"
 
           - name: "Ubuntu OpenMPI clang++"
             CC_COMPILER: clang
             CXX_COMPILER: clang++
-            container: "ubuntu_openmpi"
+            container: "ubuntu_clang_openmpi"
             MPIEXEC_PREFLAGS: "--allow-run-as-root --oversubscribe"
             USE_SANITIZER: ""
             CMAKE_BUILD_TYPE: Release
-            DOCKER_OPTIONS: " "
+            DOCKER_OPTIONS: "--cap-add SYS_PTRACE"
             CODE_COVERAGE: "OFF"
 
           - name: "Ubuntu MPICH g++"
             CC_COMPILER: gcc
             CXX_COMPILER: g++
-            container: "ubuntu_mpich"
+            container: "ubuntu_gcc_mpich"
             MPIEXEC_PREFLAGS: ""
             CMAKE_BUILD_TYPE: Debug
             DOCKER_OPTIONS: " "
@@ -58,35 +58,37 @@ jobs:
           - name: "Ubuntu MPICH clang++"
             CC_COMPILER: clang
             CXX_COMPILER: clang++
-            container: "ubuntu_mpich"
+            container: "ubuntu_clang_mpich"
             MPIEXEC_PREFLAGS: ""
             CMAKE_BUILD_TYPE: Release
             DOCKER_OPTIONS: " "
             CODE_COVERAGE: "OFF"
 
           # Hangs on github
-          # - name: "Debian OpenMPI g++"
-          #   CC_COMPILER: gcc
-          #   CXX_COMPILER: g++
-          #   container: "debian_openmpi"
-          #   MPIEXEC_PREFLAGS: "--allow-run-as-root --oversubscribe --mca btl_vader_single_copy_mechanism none"
-          #   USE_SANITIZER: ""
-          #   CMAKE_BUILD_TYPE: Debug
-          #   DOCKER_OPTIONS: "--cap-add SYS_PTRACE"
-
-          # - name: "Debian OpenMPI clang++"
-          #   CC_COMPILER: clang
-          #   CXX_COMPILER: clang++
-          #   container: "debian_openmpi"
-          #   MPIEXEC_PREFLAGS: "--allow-run-as-root --oversubscribe --mca btl_vader_single_copy_mechanism none"
-          #   USE_SANITIZER: ""
-          #   CMAKE_BUILD_TYPE: Debug
-          #   DOCKER_OPTIONS: "--cap-add SYS_PTRACE"
+          - name: "Debian OpenMPI g++"
+            CC_COMPILER: gcc
+            CXX_COMPILER: g++
+            container: "debian_gcc_openmpi"
+            MPIEXEC_PREFLAGS: "--allow-run-as-root --oversubscribe"
+            USE_SANITIZER: ""
+            CMAKE_BUILD_TYPE: Release
+            DOCKER_OPTIONS: "--cap-add SYS_PTRACE"
+            CODE_COVERAGE: "OFF"
+
+          - name: "Debian OpenMPI clang++"
+            CC_COMPILER: clang
+            CXX_COMPILER: clang++
+            container: "debian_clang_openmpi"
+            MPIEXEC_PREFLAGS: "--allow-run-as-root --oversubscribe"
+            USE_SANITIZER: ""
+            CMAKE_BUILD_TYPE: Release
+            DOCKER_OPTIONS: "--cap-add SYS_PTRACE"
+            CODE_COVERAGE: "OFF"
 
           - name: "Debian MPICH g++"
             CC_COMPILER: gcc
             CXX_COMPILER: g++
-            container: "debian_mpich"
+            container: "debian_gcc_mpich"
             MPIEXEC_PREFLAGS: ""
             CMAKE_BUILD_TYPE: Release
             DOCKER_OPTIONS: " "
@@ -95,7 +97,7 @@ jobs:
           - name: "Debian MPICH clang++"
             CC_COMPILER: clang
             CXX_COMPILER: clang++
-            container: "debian_mpich"
+            container: "debian_clang_mpich"
             MPIEXEC_PREFLAGS: ""
             CMAKE_BUILD_TYPE: Release
             DOCKER_OPTIONS: " "
@@ -115,7 +117,7 @@ jobs:
     if: "!contains(github.event.head_commit.message, '[ci skip]')"
     steps:
       - name: Checkout htool-python
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
         with:
           submodules: "true"
 
@@ -141,7 +143,9 @@ jobs:
 
       - name: Check format
         if: matrix.CODE_COVERAGE == 'ON'
+        # https://github.com/actions/checkout/pull/762
         run: |
+          git config --global --add safe.directory `pwd` 
           cd build
           make format
           make cmake-format
diff --git a/lib/hpddm b/lib/hpddm
index 9c5092c..d19056d 160000
--- a/lib/hpddm
+++ b/lib/hpddm
@@ -1 +1 @@
-Subproject commit 9c5092ca711678ebc257ffadc7d29057d4566a87
+Subproject commit d19056d0e3010b905fca5b73f5c48a766b8a042d
diff --git a/lib/htool b/lib/htool
index b6e9169..2343b2b 160000
--- a/lib/htool
+++ b/lib/htool
@@ -1 +1 @@
-Subproject commit b6e91690f8d7c20d67dfb3b8db2e7ea674405a37
+Subproject commit 2343b2b17bdeac8dd867a9f1337f1f083ddab198
diff --git a/src/htool/cluster.hpp b/src/htool/cluster.hpp
index 6fbb7e5..255360d 100644
--- a/src/htool/cluster.hpp
+++ b/src/htool/cluster.hpp
@@ -16,7 +16,6 @@ using namespace htool;
 
 template <typename ClusterType>
 void declare_Cluster(py::module &m, const std::string &className) {
-
     py::class_<ClusterType, std::shared_ptr<ClusterType>, VirtualCluster> py_class(m, className.c_str());
     py_class.def(py::init<int>());
     py_class.def(
@@ -55,6 +54,7 @@ void declare_Cluster(py::module &m, const std::string &className) {
         }
         return py::array_t<int, py::array::f_style>(shape, masteroffset_raw.data());
     });
+    py_class.def("get_global_perm", overload_cast_<int>()(&ClusterType::get_global_perm, py::const_));
     py_class.def(
         "display", [](ClusterType &self, py::array_t<double, py::array::f_style | py::array::forcecast> x, int depth, bool show, MPI_Comm_wrapper comm) {
             int rankWorld;
@@ -74,7 +74,7 @@ void declare_Cluster(py::module &m, const std::string &className) {
                 // Permuted geometric points
                 for (int i = 0; i < size; ++i) {
                     for (int p = 0; p < space_dim; p++) {
-                        output[i + size * p] = x.at(p, root->get_perm(i));
+                        output[i + size * p] = x.at(p, root->get_global_perm(i));
                     }
                 }
 
@@ -143,4 +143,4 @@ void declare_Cluster(py::module &m, const std::string &className) {
     py_class.def("set_minclustersize", &ClusterType::set_minclustersize);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/htool/ddm_solver.hpp b/src/htool/ddm_solver.hpp
index 770fa67..042418b 100644
--- a/src/htool/ddm_solver.hpp
+++ b/src/htool/ddm_solver.hpp
@@ -98,4 +98,4 @@ void declare_DDM(py::module &m, const std::string &className) {
     py_class.def("get_infos", &Class::get_infos);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/htool/hmatrix.hpp b/src/htool/hmatrix.hpp
index 6a2e8a1..acd4f07 100644
--- a/src/htool/hmatrix.hpp
+++ b/src/htool/hmatrix.hpp
@@ -9,6 +9,7 @@
 #include "lrmat_generator.hpp"
 #include "matrix.hpp"
 #include "misc.hpp"
+#include "off_diagonal_approximation.hpp"
 #include "wrapper_mpi.hpp"
 #include <htool/htool.hpp>
 
@@ -48,6 +49,9 @@ void declare_HMatrix(py::module &m, const std::string &baseclassName, const std:
     py_class.def("set_compression", [](Class &self, std::shared_ptr<VirtualLowRankGeneratorCpp<T>> mat) {
         self.set_compression(mat);
     });
+    py_class.def("set_off_diagonal_approximation", [](Class &self, std::shared_ptr<VirtualOffDiagonalApproximation<T>> mat) {
+        self.set_off_diagonal_approximation(mat);
+    });
 
     // Getters
     py_class.def_property_readonly("shape", [](const Class &self) {
@@ -57,6 +61,14 @@ void declare_HMatrix(py::module &m, const std::string &baseclassName, const std:
     py_class.def("get_perm_s", overload_cast_<>()(&Class::get_perms, py::const_));
     py_class.def("get_MasterOffset_t", overload_cast_<>()(&Class::get_MasterOffset_t, py::const_));
     py_class.def("get_MasterOffset_s", overload_cast_<>()(&Class::get_MasterOffset_s, py::const_));
+    py_class.def("get_off_diagonal_geometries", [](const Class &self, const py::array_t<double, py::array::f_style> &xt, const py::array_t<double, py::array::f_style> &xs) {
+        int new_nc, new_nr, spatial_dimension;
+        self.get_off_diagonal_size(new_nr, new_nc);
+        spatial_dimension = self.get_target_cluster()->get_space_dim();
+        py::array_t<double, py::array::f_style> new_xs(std::array<long int, 2>{spatial_dimension, new_nc}), new_xt(std::array<long int, 2>{spatial_dimension, new_nr});
+        self.get_off_diagonal_geometries(xt.data(), xs.data(), new_xt.mutable_data(), new_xs.mutable_data());
+        return std::tuple<py::array_t<double, py::array::f_style>, py::array_t<double, py::array::f_style>>(new_xt, new_xs);
+    });
 
     // Linear algebra
     py_class.def("__mul__", [](const Class &self, std::vector<T> b) {
@@ -102,50 +114,9 @@ void declare_HMatrix(py::module &m, const std::string &baseclassName, const std:
     // Plot pattern
     py_class.def(
         "display", [](const Class &self, bool show = true) {
-            const std::vector<LowRankMatrix<T> *> &lrmats = self.get_MyFarFieldMats();
-            const std::vector<SubMatrix<T> *> &dmats      = self.get_MyNearFieldMats();
-
-            int nb        = dmats.size() + lrmats.size();
-            int sizeworld = self.get_sizeworld();
-            int rankworld = self.get_rankworld();
-
-            int nbworld[sizeworld];
-            MPI_Allgather(&nb, 1, MPI_INT, nbworld, 1, MPI_INT, self.get_comm());
-            int nbg = 0;
-            for (int i = 0; i < sizeworld; i++) {
-                nbg += nbworld[i];
-            }
-
-            std::vector<int> buf(5 * nbg, 0);
-
-            for (int i = 0; i < dmats.size(); i++) {
-                const SubMatrix<T> &l = *(dmats[i]);
-                buf[5 * i]            = l.get_offset_i();
-                buf[5 * i + 1]        = l.nb_rows();
-                buf[5 * i + 2]        = l.get_offset_j();
-                buf[5 * i + 3]        = l.nb_cols();
-                buf[5 * i + 4]        = -1;
-            }
-
-            for (int i = 0; i < lrmats.size(); i++) {
-                const LowRankMatrix<T> &l       = *(lrmats[i]);
-                buf[5 * (dmats.size() + i)]     = l.get_offset_i();
-                buf[5 * (dmats.size() + i) + 1] = l.nb_rows();
-                buf[5 * (dmats.size() + i) + 2] = l.get_offset_j();
-                buf[5 * (dmats.size() + i) + 3] = l.nb_cols();
-                buf[5 * (dmats.size() + i) + 4] = l.rank_of();
-            }
-
-            int displs[sizeworld];
-            int recvcounts[sizeworld];
-            displs[0] = 0;
-
-            for (int i = 0; i < sizeworld; i++) {
-                recvcounts[i] = 5 * nbworld[i];
-                if (i > 0)
-                    displs[i] = displs[i - 1] + recvcounts[i - 1];
-            }
-            MPI_Gatherv(rankworld == 0 ? MPI_IN_PLACE : buf.data(), recvcounts[rankworld], MPI_INT, buf.data(), recvcounts, displs, MPI_INT, 0, self.get_comm());
+            int rankworld        = self.get_rankworld();
+            std::vector<int> buf = self.get_output();
+            int nbg              = buf.size() / 5;
 
             if (rankworld == 0) {
                 // Import
@@ -158,6 +129,7 @@ void declare_HMatrix(py::module &m, const std::string &baseclassName, const std:
                 int nr = self.nb_rows();
                 int nc = self.nb_cols();
                 py::array_t<int> matrix({nr, nc});
+                matrix.attr("fill")(0);
                 py::array_t<bool> mask_matrix({nr, nc});
                 mask_matrix.attr("fill")(false);
 
@@ -246,7 +218,7 @@ void declare_HMatrix(py::module &m, const std::string &baseclassName, const std:
                 // Permuted geometric points
                 for (int i = 0; i < size; ++i) {
                     for (int p = 0; p < space_dim; p++) {
-                        output[i + size * p] = points_target.at(p, root->get_perm(i));
+                        output[i + size * p] = points_target.at(p, root->get_global_perm(i));
                     }
                 }
 
diff --git a/src/htool/lrmat_generator.hpp b/src/htool/lrmat_generator.hpp
index c34a07d..d097dd7 100644
--- a/src/htool/lrmat_generator.hpp
+++ b/src/htool/lrmat_generator.hpp
@@ -15,8 +15,9 @@ using namespace htool;
 template <typename T>
 class VirtualLowRankGeneratorCpp : public VirtualLowRankGenerator<T> {
 
-    py::array_t<T, py::array::f_style> mat_U, mat_V;
     int rank;
+    mutable std::vector<py::array_t<T, py::array::f_style>> mats_U; // owned by Python
+    mutable std::vector<py::array_t<T, py::array::f_style>> mats_V; // owned by Python
 
   public:
     using VirtualLowRankGenerator<T>::VirtualLowRankGenerator;
@@ -24,18 +25,18 @@ class VirtualLowRankGeneratorCpp : public VirtualLowRankGenerator<T> {
     void copy_low_rank_approximation(double epsilon, int M, int N, const int *const rows, const int *const cols, int &rank0, T **U, T **V, const VirtualGenerator<T> &A, const VirtualCluster &t, const double *const xt, const VirtualCluster &s, const double *const xs) const override {
 
         build_low_rank_approximation(epsilon, rank0, A, std::vector<int>(rows, rows + M), std::vector<int>(cols, cols + N));
-        *U    = new T[M * rank];
-        *V    = new T[N * rank];
+        *U    = mats_U.back().mutable_data();
+        *V    = mats_V.back().mutable_data();
         rank0 = rank;
-        std::copy_n(mat_U.data(), mat_U.size(), *U);
-        std::copy_n(mat_V.data(), mat_V.size(), *V);
     }
 
+    bool is_htool_owning_data() const override { return false; }
+
     // lcov does not see it because of trampoline I assume
     virtual void build_low_rank_approximation(double epsilon, int rank, const VirtualGenerator<T> &A, const std::vector<int> &J, const std::vector<int> &K) const = 0; // LCOV_EXCL_LINE
 
-    void set_U(py::array_t<T, py::array::f_style> U0) { mat_U = U0; }
-    void set_V(py::array_t<T, py::array::f_style> V0) { mat_V = V0; }
+    void set_U(py::array_t<T, py::array::f_style> U0) { mats_U.push_back(U0); }
+    void set_V(py::array_t<T, py::array::f_style> V0) { mats_V.push_back(V0); }
     void set_rank(int rank0) { rank = rank0; }
 };
 
diff --git a/src/htool/main.cpp b/src/htool/main.cpp
index 2d907c0..795cf00 100644
--- a/src/htool/main.cpp
+++ b/src/htool/main.cpp
@@ -4,6 +4,7 @@
 #include "hmatrix.hpp"
 #include "lrmat_generator.hpp"
 #include "matrix.hpp"
+#include "off_diagonal_approximation.hpp"
 #include "wrapper_mpi.hpp"
 
 PYBIND11_MODULE(Htool, m) {
@@ -27,6 +28,9 @@ PYBIND11_MODULE(Htool, m) {
     declare_HMatrix<double>(m, "HMatrixVirtual", "HMatrix");
     declare_HMatrix<std::complex<double>>(m, "ComplexHMatrixVirtual", "ComplexHMatrix");
 
+    declare_VirtualOffDiagonalApproximation<double>(m, "VirtualOffDiagonalApproximation", "CustomOffDiagonalApproximation", "HMatrixOffDiagonalApproximation");
+    declare_VirtualOffDiagonalApproximation<std::complex<double>>(m, "ComplexVirtualOffDiagonalApproximation", "ComplexCustomOffDiagonalApproximation", "ComplexHMatrixOffDiagonalApproximation");
+
     declare_DDM<double>(m, "DDM");
     declare_DDM<std::complex<double>>(m, "ComplexDDM");
 }
diff --git a/src/htool/off_diagonal_approximation.hpp b/src/htool/off_diagonal_approximation.hpp
new file mode 100644
index 0000000..c68c19a
--- /dev/null
+++ b/src/htool/off_diagonal_approximation.hpp
@@ -0,0 +1,189 @@
+#ifndef HTOOL_OFF_DIAGONAL_APPROXIMATION_CPP
+#define HTOOL_OFF_DIAGONAL_APPROXIMATION_CPP
+
+#include <pybind11/numpy.h>
+#include <pybind11/operators.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <htool/htool.hpp>
+#include <htool/types/off_diagonal_approximation_with_hmatrix.hpp>
+
+template <typename T>
+class VirtualOffDiagonalApproximationCpp : public VirtualOffDiagonalApproximation<T> {
+    int nc, nr;
+
+  public:
+    using VirtualOffDiagonalApproximation<T>::VirtualOffDiagonalApproximation;
+
+    VirtualOffDiagonalApproximationCpp(const VirtualHMatrix<T> &HA) {
+        HA.get_off_diagonal_size(nr, nc);
+    }
+
+    void mvprod_global_to_local(const T *const in, T *const out, const int &mu) override {
+
+        py::array_t<T, py::array::c_style> in_py(std::array<long int, 2>{nc, mu}, in, py::capsule(in));
+        py::array_t<T, py::array::c_style> out_py(std::array<long int, 2>{nr, mu}, out, py::capsule(out));
+
+        mat_mat_prod_global_to_local(in_py, out_py);
+        // for (int i = 0; i < nr; i++) {
+        //     for (int j = 0; j < mu; j++) {
+        //         std::cout << out[i * mu + j] << " " << out_py.at(i, j) << " ";
+        //     }
+        //     std::cout << "\n";
+        // }
+    }
+
+    void mvprod_subrhs_to_local(const T *const in, T *const out, const int &mu, const int &offset, const int &size) override {
+        py::array_t<T, py::array::c_style> in_py(std::array<long int, 2>{size, mu}, in, py::capsule(in));
+        py::array_t<T, py::array::c_style> out_py(std::array<long int, 2>{nr, mu}, out, py::capsule(out));
+
+        mat_mat_prod_sub_rhs_to_local(in_py, out_py, mu, offset, size);
+    }
+
+    // lcov does not see it because of trampoline I assume
+    virtual void mat_mat_prod_global_to_local(const py::array_t<T, py::array::c_style> &in, py::array_t<T, py::array::c_style> &out) const = 0; // LCOV_EXCL_LINE
+
+    virtual void mat_mat_prod_sub_rhs_to_local(const py::array_t<T, py::array::c_style> &in, py::array_t<T, py::array::c_style> &out, int mu, int offset, int size) const {
+        std::vector<T> in_global(nc * mu, 0);
+        std::copy_n(in.data(), size * mu, in_global.data() + offset * mu);
+        py::array_t<T, py::array::c_style> in_global_pyarray({nc, mu}, in_global.data());
+
+        this->mat_mat_prod_global_to_local(in_global_pyarray, out);
+    }
+};
+
+template <typename T>
+class PyVirtualOffDiagonalApproximation : public VirtualOffDiagonalApproximationCpp<T> {
+  public:
+    using VirtualOffDiagonalApproximationCpp<T>::VirtualOffDiagonalApproximationCpp;
+    // PyVirtualGenerator(int nr0, int nc0): IMatrix<T>(nr0,nc0){}
+
+    /* Trampoline (need one for each virtual function) */
+    virtual void mat_mat_prod_global_to_local(const py::array_t<T, py::array::c_style> &in, py::array_t<T, py::array::c_style> &out) const override {
+        PYBIND11_OVERRIDE_PURE(
+            void,                                  /* Return type */
+            VirtualOffDiagonalApproximationCpp<T>, /* Parent class */
+            mat_mat_prod_global_to_local,          /* Name of function in C++ (must match Python name) */
+            in,
+            out /* Argument(s) */
+        );
+    }
+
+    /* Trampoline (need one for each virtual function) */
+    virtual void mat_mat_prod_sub_rhs_to_local(const py::array_t<T, py::array::c_style> &in, py::array_t<T, py::array::c_style> &out, int mu, int offset, int size) const override {
+        PYBIND11_OVERRIDE(
+            void,                                  /* Return type */
+            VirtualOffDiagonalApproximationCpp<T>, /* Parent class */
+            mat_mat_prod_sub_rhs_to_local,         /* Name of function in C++ (must match Python name) */
+            in,
+            out,
+            mu,
+            offset,
+            size /* Argument(s) */
+        );
+    }
+};
+
+template <typename T>
+void declare_VirtualOffDiagonalApproximation(py::module &m, const std::string &BaseClassName, const std::string &VirtualClassName, const std::string &ClassName) {
+    // Not to be used, but we need to declare it to use it as an argument
+    using VirtualBaseClass = VirtualOffDiagonalApproximation<T>;
+    py::class_<VirtualBaseClass, std::shared_ptr<VirtualBaseClass>> py_base_class(m, BaseClassName.c_str());
+
+    // Virtual class that the user can use
+    using VirtualClass = VirtualOffDiagonalApproximationCpp<T>;
+    py::class_<VirtualClass, std::shared_ptr<VirtualClass>, PyVirtualOffDiagonalApproximation<T>, VirtualBaseClass> py_virtual_class(m, VirtualClassName.c_str());
+    py_virtual_class.def(py::init<const VirtualHMatrix<T> &>());
+    py_virtual_class.def("mat_mat_prod_global_to_local", &VirtualClass::mat_mat_prod_global_to_local);
+
+    // Possible off diagonal approximation defined in htool
+    using Class = OffDiagonalApproximationWithHMatrix<T>;
+    py::class_<Class, std::shared_ptr<Class>, VirtualBaseClass> py_class(m, ClassName.c_str());
+    py_class.def(py::init<VirtualHMatrix<T> *, std::shared_ptr<VirtualCluster>, std::shared_ptr<VirtualCluster>>());
+    py_class.def("build", [](Class &self, VirtualGeneratorCpp<T> &mat, const py::array_t<double, py::array::f_style> &x, const py::array_t<double, py::array::f_style> &y) {
+        self.build(mat, x.data(), y.data());
+    });
+    py_class.def("set_compression", [](Class &self, std::shared_ptr<VirtualLowRankGeneratorCpp<T>> mat) {
+        self.set_compression(mat);
+    });
+    py_class.def("print_infos", [](Class &self) {
+        self.get_HMatrix()->print_infos();
+    });
+    // Plot pattern
+    py_class.def(
+        "display", [](const Class &self, bool show = true) {
+            std::vector<int> buf = self.get_output();
+            int nbg              = buf.size() / 5;
+
+            // Import
+            py::object plt     = py::module::import("matplotlib.pyplot");
+            py::object patches = py::module::import("matplotlib.patches");
+            py::object colors  = py::module::import("matplotlib.colors");
+            py::object numpy   = py::module::import("numpy");
+
+            // First Data
+            int nr = self.nb_rows();
+            int nc = self.nb_cols();
+            py::array_t<int> matrix({nr, nc});
+            matrix.attr("fill")(0);
+            py::array_t<bool> mask_matrix({nr, nc});
+            mask_matrix.attr("fill")(false);
+
+            // Figure
+            py::tuple sublots_output = plt.attr("subplots")(1, 1);
+            py::object fig           = sublots_output[0];
+            py::object axes          = sublots_output[1];
+            // axes.attr()
+
+            // Issue: there a shift of one pixel along the y-axis...
+            // int shift = axes.transData.transform([(0,0), (1,1)])
+            // shift = shift[1,1] - shift[0,1]  # 1 unit in display coords
+            int shift = 0;
+
+            int max_rank = 0;
+            for (int p = 0; p < nbg; p++) {
+                int i_row  = buf[5 * p];
+                int nb_row = buf[5 * p + 1];
+                int i_col  = buf[5 * p + 2];
+                int nb_col = buf[5 * p + 3];
+                int rank   = buf[5 * p + 4];
+
+                if (rank > max_rank) {
+                    max_rank = rank;
+                }
+                for (int i = 0; i < nb_row; i++) {
+                    for (int j = 0; j < nb_col; j++) {
+                        matrix.mutable_at(i_row + i, i_col + j) = rank;
+                        if (rank == -1) {
+                            mask_matrix.mutable_at(i_row + i, i_col + j) = true;
+                        }
+                    }
+                }
+
+                py::object rect = patches.attr("Rectangle")(py::make_tuple(i_col - 0.5, i_row - 0.5 + shift), nb_col, nb_row, "linewidth"_a = 0.75, "edgecolor"_a = 'k', "facecolor"_a = "none");
+                axes.attr("add_patch")(rect);
+
+                if (rank >= 0 && nb_col / double(nc) > 0.05 && nb_row / double(nc) > 0.05) {
+                    axes.attr("annotate")(rank, py::make_tuple(i_col + nb_col / 2., i_row + nb_row / 2.), "color"_a = "white", "size"_a = 10, "va"_a = "center", "ha"_a = "center");
+                }
+            }
+
+            // Colormap
+            py::object cmap     = plt.attr("get_cmap")("YlGn");
+            py::object new_cmap = colors.attr("LinearSegmentedColormap").attr("from_list")("trunc(YlGn,0.4,1)", cmap(numpy.attr("linspace")(0.4, 1, 100)));
+
+            // Plot
+            py::object masked_matrix = numpy.attr("ma").attr("array")(matrix, "mask"_a = mask_matrix);
+            new_cmap.attr("set_bad")("color"_a = "red");
+
+            plt.attr("imshow")(masked_matrix, "cmap"_a = new_cmap, "vmin"_a = 0, "vmax"_a = 10);
+            plt.attr("draw")();
+            if (show) {
+                plt.attr("show")(); // LCOV_EXCL_LINE
+            }
+        },
+        py::arg("show") = true);
+}
+
+#endif
diff --git a/tests/test_ddm_solver.py b/tests/test_ddm_solver.py
index c2bdcdd..d80c34f 100644
--- a/tests/test_ddm_solver.py
+++ b/tests/test_ddm_solver.py
@@ -21,14 +21,31 @@ def build_submatrix(self, J, K, mat):
             for k in range(0, len(K)):
                 mat[j, k] = self.get_coef(J[j], K[k])
 
+    def inplace_matmat(self, X, Y):
+        for i in range(0, self.nb_rows()):
+            for j in range(0, X.shape[1]):
+                for k in range(0, self.nb_cols()):
+                    Y[i, j] += self.get_coef(i, k)*X[k, j]
 
-@pytest.mark.parametrize("mu,Symmetry", [
-    (1, 'S'),
-    (10, 'S'),
-    (1, 'N'),
-    (10, 'N'),
+
+class CustomOffDiagonalApproximation(Htool.ComplexCustomOffDiagonalApproximation):
+    def __init__(self, HA, subA):
+        Htool.ComplexCustomOffDiagonalApproximation.__init__(self, HA)
+        self.off_diagonal_generator = GeneratorCoef(subA)
+
+    def mat_mat_prod_global_to_local(self, x, y):
+        self.off_diagonal_generator.inplace_matmat(x, y)
+
+
+@pytest.mark.parametrize("mu,Symmetry,OffDiagonalApproximation", [
+    (1, 'S', None),
+    (10, 'S', None),
+    (1, 'N', None),
+    (10, 'N', None),
+    (1, 'N', "Dense"),
+    (1, 'S', "Dense"),
 ])
-def test_ddm_solver(mu, Symmetry):
+def test_ddm_solver(mu, Symmetry, OffDiagonalApproximation):
 
     # MPI
     comm = MPI.COMM_WORLD
@@ -95,6 +112,32 @@ def test_ddm_solver(mu, Symmetry):
     # Hmatrix
     generator = GeneratorCoef(A)
     hmat = Htool.ComplexHMatrix(cluster, cluster, tol, eta, Symmetry, UPLO)
+
+    if OffDiagonalApproximation is not None:
+        if OffDiagonalApproximation == "Dense":
+            MasterOffset = cluster.get_masteroffset()
+            A_perm = np.zeros((n, n), dtype=np.dtype('complex128'))
+            for i in range(0, n):
+                for j in range(0, n):
+                    A_perm[i, j] = A[cluster.get_global_perm(
+                        i), cluster.get_global_perm(j)]
+
+            rows = np.arange(MasterOffset[0, rank],
+                             MasterOffset[0, rank]+MasterOffset[1, rank], 1, dtype=int)
+            cols = np.zeros(0, dtype=int)
+            for i in range(0, size):
+                if i != rank:
+                    cols = np.concatenate((cols, np.arange(
+                        MasterOffset[0, i],  MasterOffset[0, i]+MasterOffset[1, i], 1, dtype=int)))
+            if rank == 1:
+                print(rows)
+            submatrix = A_perm[np.ix_(rows, cols)]
+            off_diagonal_approximation = CustomOffDiagonalApproximation(
+                hmat, submatrix)
+
+        hmat.set_off_diagonal_approximation(
+            off_diagonal_approximation)
+
     hmat.build(generator, p)
 
     # Global vectors
diff --git a/tests/test_hmatrix.py b/tests/test_hmatrix.py
index 9aef925..1dffb62 100644
--- a/tests/test_hmatrix.py
+++ b/tests/test_hmatrix.py
@@ -38,6 +38,12 @@ def matmat(self, X):
                     Y[i, j] += self.get_coef(i, k)*X[k, j]
         return Y
 
+    def inplace_matmat(self, X, Y):
+        for i in range(0, self.nb_rows()):
+            for j in range(0, X.shape[1]):
+                for k in range(0, self.nb_cols()):
+                    Y[i, j] += self.get_coef(i, k)*X[k, j]
+
 
 class CustomSVD(Htool.CustomLowRankGenerator):
 
@@ -58,6 +64,16 @@ def build_low_rank_approximation(self, epsilon, rank, A, J, K):
         self.set_rank(count)
 
 
+class CustomOffDiagonalApproximation(Htool.CustomOffDiagonalApproximation):
+    def __init__(self, HA, off_diagonal_points_target, off_diagonal_points_source):
+        Htool.CustomOffDiagonalApproximation.__init__(self, HA)
+        self.off_diagonal_generator = GeneratorSubMatrix(
+            off_diagonal_points_target, off_diagonal_points_source)
+
+    def mat_mat_prod_global_to_local(self, x, y):
+        self.off_diagonal_generator.inplace_matmat(x, y)
+
+
 class DenseBlockGenerator(Htool.CustomDenseBlocksGenerator):
     def __init__(self, points_target, points_source):
         super().__init__()
@@ -76,20 +92,36 @@ def build_dense_blocks(self, rows, cols, blocks):
                         self.points_target[:, rows[i][j]] - self.points_source[:, cols[i][k]]))
 
 
-@pytest.mark.parametrize("NbRows,NbCols,Symmetric,UPLO,Compression,Delay", [
-    (500, 500, 'S', 'L', None, False),
-    (500, 500, 'S', 'U', None, False),
-    (500, 500, 'N', 'N', None, False),
-    (500, 250, 'N', 'N', None, False),
-    (500, 500, 'S', 'L', None, True),
-    (500, 500, 'S', 'U', None, True),
-    (500, 500, 'N', 'N', None, True),
-    (500, 500, 'S', 'L', "Custom", True),
-    (500, 500, 'S', 'U', "Custom", True),
-    (500, 500, 'N', 'N', "Custom", True),
-    (500, 250, 'N', 'N', "Custom", True),
+@pytest.mark.parametrize("NbRows,NbCols,Symmetric,UPLO,Compression,Delay,OffDiagonalApproximation", [
+    (500, 500, 'S', 'L', None, False, None),
+    (500, 500, 'S', 'U', None, False, None),
+    (500, 500, 'N', 'N', None, False, None),
+    (500, 250, 'N', 'N', None, False, None),
+    (500, 500, 'S', 'L', None, True, None),
+    (500, 500, 'S', 'U', None, True, None),
+    (500, 500, 'N', 'N', None, True, None),
+    (500, 500, 'S', 'L', "Custom", True, None),
+    (500, 500, 'S', 'U', "Custom", True, None),
+    (500, 500, 'N', 'N', "Custom", True, None),
+    (500, 250, 'N', 'N', "Custom", True, None),
+    (500, 500, 'S', 'L', None, False, "Dense"),
+    (500, 500, 'S', 'U', None, False, "Dense"),
+    (500, 500, 'N', 'N', None, False, "Dense"),
+    (500, 250, 'N', 'N', None, False, "Dense"),
+    (500, 500, 'S', 'L', None, False, "HMatrix"),
+    (500, 500, 'S', 'U', None, False, "HMatrix"),
+    (500, 500, 'N', 'N', None, False, "HMatrix"),
+    (500, 250, 'N', 'N', None, False, "HMatrix"),
+    (500, 500, 'S', 'L', "Custom", False, "Dense"),
+    (500, 500, 'S', 'U', "Custom", False, "Dense"),
+    (500, 500, 'N', 'N', "Custom", False, "Dense"),
+    (500, 250, 'N', 'N', "Custom", False, "Dense"),
+    (500, 500, 'S', 'L', "Custom", False, "HMatrix"),
+    (500, 500, 'S', 'U', "Custom", False, "HMatrix"),
+    (500, 500, 'N', 'N', "Custom", False, "HMatrix"),
+    (500, 250, 'N', 'N', "Custom", False, "HMatrix"),
 ])
-def test_HMatrix(NbRows, NbCols, Symmetric, UPLO, Compression, Delay):
+def test_HMatrix(NbRows, NbCols, Symmetric, UPLO, Compression, Delay, OffDiagonalApproximation):
 
     # Random geometry
     np.random.seed(0)
@@ -127,6 +159,46 @@ def test_HMatrix(NbRows, NbCols, Symmetric, UPLO, Compression, Delay):
     HMatrix = Htool.HMatrix(cluster_target, cluster_source,
                             epsilon, eta, Symmetric, UPLO)
 
+    if OffDiagonalApproximation is not None:
+        # Geometry
+        off_diagonal_points_target, off_diagonal_points_source = HMatrix.get_off_diagonal_geometries(
+            points_target, points_source)
+
+        if OffDiagonalApproximation == "Dense":
+            off_diagonal_approximation = CustomOffDiagonalApproximation(HMatrix,
+                                                                        off_diagonal_points_target, off_diagonal_points_source)
+
+        if OffDiagonalApproximation == "HMatrix":
+            # Clustering
+            off_diagonal_cluster_target = Htool.PCARegularClustering(3)
+            off_diagonal_cluster_source = Htool.PCARegularClustering(3)
+            off_diagonal_cluster_target.set_minclustersize(minclustersize)
+            off_diagonal_cluster_source.set_minclustersize(minclustersize)
+            off_diagonal_cluster_target.build(
+                off_diagonal_points_target.shape[1], off_diagonal_points_target, 2, mpi4py.MPI.COMM_SELF)
+            off_diagonal_cluster_source.build(
+                off_diagonal_points_source.shape[1], off_diagonal_points_source, 2, mpi4py.MPI.COMM_SELF)
+
+            # Off diagonal generator
+            off_diagonal_generator = GeneratorSubMatrix(
+                off_diagonal_points_target, off_diagonal_points_source)
+
+            # Off diagonal HMatrix
+            off_diagonal_approximation = Htool.HMatrixOffDiagonalApproximation(
+                HMatrix, off_diagonal_cluster_target, off_diagonal_cluster_source)
+            if Compression is not None:
+                compression = CustomSVD()
+                off_diagonal_approximation.set_compression(compression)
+
+            off_diagonal_approximation.build(off_diagonal_generator,
+                                             off_diagonal_points_target, off_diagonal_points_source)
+
+            off_diagonal_approximation.print_infos()
+            off_diagonal_approximation.display(False)
+
+        HMatrix.set_off_diagonal_approximation(
+            off_diagonal_approximation)
+
     if Compression is not None:
         compression = CustomSVD()
         HMatrix.set_compression(compression)