diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index 0d9ade659fef..30ea6cd51674 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -470,6 +470,14 @@ Learning Control Parameters
 
   - ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
 
+- ``monotone_penalty`` :raw-html:`🔗︎`, default = ``0.0``, type = double, aliases: ``monotone_splits_penalty``, ``ms_penalty``, ``mc_penalty``, constraints: ``monotone_penalty >= 0.0``
+
+  - used only if ``monotone_constraints`` is set
+
+  - `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function of the penalization parameter
+
+  - if ``0.0`` (the default), no penalization is applied
+
 - ``feature_contri`` :raw-html:`🔗︎`, default = ``None``, type = multi-double, aliases: ``feature_contrib``, ``fc``, ``fp``, ``feature_penalty``
 
   - used to control feature's split gain, will use ``gain[i] = max(0, feature_contri[i]) * gain[i]`` to replace the split gain of i-th feature
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 057f56e99491..01a2061f8efb 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -447,6 +447,13 @@ struct Config {
   // descl2 = ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results
   std::string monotone_constraints_method = "basic";
 
+  // alias = monotone_splits_penalty, ms_penalty, mc_penalty
+  // check = >=0.0
+  // desc = used only if ``monotone_constraints`` is set
+  // desc = `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function of the penalization parameter
+  // desc = if ``0.0`` (the default), no penalization is applied
+  double monotone_penalty = 0.0;
+
   // type = multi-double
   // alias = feature_contrib, fc, fp, feature_penalty
   // default = None
diff --git a/src/io/config.cpp b/src/io/config.cpp
index 0cf1d3c8bf21..a6f475e89d83 100644
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -328,6 +328,9 @@ void Config::CheckParamConflict() {
     Log::Warning("Cannot use \"intermediate\" monotone constraints with feature fraction different from 1, auto set monotone constraints to \"basic\" method.");
     monotone_constraints_method = "basic";
   }
+  if (max_depth > 0 && monotone_penalty >= max_depth) {
+    Log::Warning("Monotone penalty greater than tree depth. Monotone features won't be used.");
+  }
 }
 
 std::string Config::ToString() const {
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 59cc62a5d375..b2204affb4df 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -87,6 +87,9 @@ const std::unordered_map<std::string, std::string>& Config::alias_table() {
   {"monotone_constraint", "monotone_constraints"},
   {"monotone_constraining_method", "monotone_constraints_method"},
   {"mc_method", "monotone_constraints_method"},
+  {"monotone_splits_penalty", "monotone_penalty"},
+  {"ms_penalty", "monotone_penalty"},
+  {"mc_penalty", "monotone_penalty"},
   {"feature_contrib", "feature_contri"},
   {"fc", "feature_contri"},
   {"fp", "feature_contri"},
@@ -218,6 +221,7 @@ const std::unordered_set<std::string>& Config::parameter_set() {
   "top_k",
   "monotone_constraints",
   "monotone_constraints_method",
+  "monotone_penalty",
   "feature_contri",
   "forcedsplits_filename",
   "refit_decay_rate",
@@ -419,6 +423,9 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) {
 
   GetString(params, "monotone_constraints_method", &monotone_constraints_method);
 
+  GetDouble(params, "monotone_penalty", &monotone_penalty);
+
   if (GetString(params, "feature_contri", &tmp_str)) {
     feature_contri = Common::StringToArray<double>(tmp_str, ',');
   }
@@ -639,6 +646,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[top_k: " << top_k << "]\n";
   str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
   str_buf << "[monotone_constraints_method: " << monotone_constraints_method << "]\n";
+  str_buf << "[monotone_penalty: " << monotone_penalty << "]\n";
   str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
   str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
   str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
diff --git a/src/treelearner/monotone_constraints.hpp b/src/treelearner/monotone_constraints.hpp
index 4d804d7fbfa0..dcad0d6d3288 100644
--- a/src/treelearner/monotone_constraints.hpp
+++ b/src/treelearner/monotone_constraints.hpp
@@ -62,6 +62,24 @@ class LeafConstraintsBase {
       const std::vector<SplitInfo>& best_split_per_leaf) = 0;
 
   inline static LeafConstraintsBase* Create(const Config* config, int num_leaves);
+
+  double ComputeMonotoneSplitGainPenalty(int leaf_index, double penalization) {
+    int depth = tree_->leaf_depth(leaf_index);
+    if (penalization >= depth + 1.) {
+      return kEpsilon;
+    }
+    if (penalization <= 1.) {
+      return 1. - penalization / pow(2., depth) + kEpsilon;
+    }
+    return 1. - pow(2, penalization - 1. - depth) + kEpsilon;
+  }
+
+  void ShareTreePointer(const Tree* tree) {
+    tree_ = tree;
+  }
+
+ private:
+  const Tree* tree_;
 };
 
 class BasicLeafConstraints : public LeafConstraintsBase {
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index b7569d22c8e2..6c4390553efd 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -165,6 +165,8 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
   auto tree = std::unique_ptr<Tree>(new Tree(config_->num_leaves));
   auto tree_prt = tree.get();
 
+  constraints_->ShareTreePointer(tree_prt);
+
   // root leaf
   int left_leaf = 0;
   int cur_depth = 1;
@@ -692,6 +694,11 @@ void SerialTreeLearner::ComputeBestSplitForFeature(
         cegb_->DetlaGain(feature_index, real_fidx, leaf_splits->leaf_index(), num_data, new_split);
   }
+  if (new_split.monotone_type != 0) {
+    double penalty = constraints_->ComputeMonotoneSplitGainPenalty(
+        leaf_splits->leaf_index(), config_->monotone_penalty);
+    new_split.gain *= penalty;
+  }
   if (new_split > *best_split) {
     *best_split = new_split;
   }
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 51be083a9f01..d4200649eb20 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1036,7 +1036,7 @@ def generate_trainset_for_monotone_constraints_tests(self, x3_to_category=True):
         categorical_features = []
         if x3_to_category:
             categorical_features = [2]
-        trainset = lgb.Dataset(x, label=y, categorical_feature=categorical_features)
+        trainset = lgb.Dataset(x, label=y, categorical_feature=categorical_features, free_raw_data=False)
         return trainset
 
     def test_monotone_constraints(self):
@@ -1071,8 +1071,8 @@ def is_correctly_constrained(learner, x3_to_category=True):
             return True
 
         for test_with_categorical_variable in [True, False]:
+            trainset = self.generate_trainset_for_monotone_constraints_tests(test_with_categorical_variable)
             for monotone_constraints_method in ["basic", "intermediate"]:
-                trainset = self.generate_trainset_for_monotone_constraints_tests(test_with_categorical_variable)
                 params = {
                     'min_data': 20,
                     'num_leaves': 20,
@@ -1083,6 +1083,76 @@ def is_correctly_constrained(learner, x3_to_category=True):
                 constrained_model = lgb.train(params, trainset)
                 self.assertTrue(is_correctly_constrained(constrained_model, test_with_categorical_variable))
 
+    def test_monotone_penalty(self):
+        def are_first_splits_non_monotone(tree, n, monotone_constraints):
+            if n <= 0:
+                return True
+            if "leaf_value" in tree:
+                return True
+            if monotone_constraints[tree["split_feature"]] != 0:
+                return False
+            return (are_first_splits_non_monotone(tree["left_child"], n - 1, monotone_constraints)
+                    and are_first_splits_non_monotone(tree["right_child"], n - 1, monotone_constraints))
+
+        def are_there_monotone_splits(tree, monotone_constraints):
+            if "leaf_value" in tree:
+                return False
+            if monotone_constraints[tree["split_feature"]] != 0:
+                return True
+            return (are_there_monotone_splits(tree["left_child"], monotone_constraints)
+                    or are_there_monotone_splits(tree["right_child"], monotone_constraints))
+
+        max_depth = 5
+        monotone_constraints = [1, -1, 0]
+        penalization_parameter = 2.0
+        trainset = self.generate_trainset_for_monotone_constraints_tests(x3_to_category=False)
+        for monotone_constraints_method in ["basic", "intermediate"]:
+            params = {
+                'max_depth': max_depth,
+                'monotone_constraints': monotone_constraints,
+                'monotone_penalty': penalization_parameter,
+                "monotone_constraints_method": monotone_constraints_method,
+            }
+            constrained_model = lgb.train(params, trainset, 10)
+            dumped_model = constrained_model.dump_model()["tree_info"]
+            for tree in dumped_model:
+                self.assertTrue(are_first_splits_non_monotone(tree["tree_structure"], int(penalization_parameter),
+                                                              monotone_constraints))
+                self.assertTrue(are_there_monotone_splits(tree["tree_structure"], monotone_constraints))
+
+    # test if a penalty as high as the depth indeed prohibits all monotone splits
+    def test_monotone_penalty_max(self):
+        max_depth = 5
+        monotone_constraints = [1, -1, 0]
+        penalization_parameter = max_depth
+        trainset_constrained_model = self.generate_trainset_for_monotone_constraints_tests(x3_to_category=False)
+        x = trainset_constrained_model.data
+        y = trainset_constrained_model.label
+        x3_negatively_correlated_with_y = x[:, 2]
+        trainset_unconstrained_model = lgb.Dataset(x3_negatively_correlated_with_y.reshape(-1, 1), label=y)
+        params_constrained_model = {
+            'monotone_constraints': monotone_constraints,
+            'monotone_penalty': penalization_parameter,
+            "max_depth": max_depth,
+            "gpu_use_dp": True,
+        }
+        params_unconstrained_model = {
+            "max_depth": max_depth,
+            "gpu_use_dp": True,
+        }
+
+        unconstrained_model = lgb.train(params_unconstrained_model, trainset_unconstrained_model, 10)
+        unconstrained_model_predictions = unconstrained_model.\
+            predict(x3_negatively_correlated_with_y.reshape(-1, 1))
+
+        for monotone_constraints_method in ["basic", "intermediate"]:
+            params_constrained_model["monotone_constraints_method"] = monotone_constraints_method
+            # The penalization is so high that the first 2 features should not be used here
+            constrained_model = lgb.train(params_constrained_model, trainset_constrained_model, 10)
+
+            # Check that a very high penalization is the same as not using the features at all
+            np.testing.assert_array_equal(constrained_model.predict(x), unconstrained_model_predictions)
+
     def test_max_bin_by_feature(self):
         col1 = np.arange(0, 100)[:, np.newaxis]
         col2 = np.zeros((100, 1))