FIX handle properly missing value in MSE and Friedman-MSE children_impurity #28327

Merged 20 commits on Feb 12, 2024
Changes from 7 commits
9 changes: 5 additions & 4 deletions doc/whats_new/v1.4.rst
@@ -93,10 +93,11 @@ Changelog
 ...................
 
 - |Fix| :class:`tree.DecisionTreeClassifier` and
-  :class:`tree.DecisionTreeRegressor` are handling missing values properly. The internal
-  criterion was not initialize when no missing values were present in the data, leading
-  to potentially wrong criterion values.
-  :pr:`28295` by :user:`Guillaume Lemaitre <glemaitre>`.
+  :class:`tree.DecisionTreeRegressor` now handle missing values properly for the
+  ``squared_error`` and ``friedman_mse`` criteria. The internal criterion was not initialized
+  when no missing values were present in the data, and did not always account for missing
+  values with the squared-error-based criteria, leading to potentially wrong criterion values.
+  :pr:`28295` by :user:`Guillaume Lemaitre <glemaitre>` and :pr:`28327` by :user:`Adam Li <adam2392>`.
 
 :mod:`sklearn.utils`
 ....................
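For context, the user-visible effect described in this changelog entry can be checked with a short script along the lines of the toy test added later in this PR (a sketch against the public API, assuming a scikit-learn build that includes this fix):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Toy data similar to the regression test below: one feature containing NaNs.
X = np.array([np.nan, np.nan, 3, 4, 5, 6]).reshape(-1, 1)
y = np.arange(6)

for criterion in ("squared_error", "friedman_mse"):
    tree = DecisionTreeRegressor(criterion=criterion, random_state=0).fit(X, y)
    # Node impurities are (Friedman-)MSE values, so they must be non-negative;
    # before the fix they could come out wrong on splits involving missing values.
    assert np.all(tree.tree_.impurity >= 0)
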
18 changes: 18 additions & 0 deletions sklearn/tree/_criterion.pyx
@@ -1150,6 +1150,11 @@ cdef class MSE(RegressionCriterion):
        cdef intp_t k
        cdef float64_t w = 1.0

        # The missing samples are assumed to be in
        # self.sample_indices[-self.n_missing:] that is
        # self.sample_indices[end_non_missing:self.end].
        cdef intp_t end_non_missing = self.end - self.n_missing

Review thread on the added end_non_missing line:

Member: Let's move this part inside the if statement since this is only useful for missing values.

Member: You can also remove the comment.

Member Author: We cannot define a cdef inside an if statement in Cython. Removed the comment tho.

        for p in range(start, pos):
            i = sample_indices[p]

@@ -1160,6 +1165,19 @@
                y_ik = self.y[i, k]
                sq_sum_left += w * y_ik * y_ik

        # Account for missing values that may be on the left node.
        # If so, then these samples are still not included in the
        # sq_sum_left because they are in samples[end_non_missing:end].
        if self.missing_go_to_left:
            for p in range(end_non_missing, self.end):
                i = sample_indices[p]
                if sample_weight is not None:
                    w = sample_weight[i]

                for k in range(self.n_outputs):
                    y_ik = self.y[i, k]
                    sq_sum_left += w * y_ik * y_ik

        sq_sum_right = self.sq_sum_total - sq_sum_left

        impurity_left[0] = sq_sum_left / self.weighted_n_left
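In plain NumPy terms, the block added above makes sq_sum_left include the samples whose feature value is missing whenever the split routes them to the left child. A rough sketch of the quantity being computed (illustrative only; the index sets are made up and the variable names only loosely mirror the Cython code):

import numpy as np

y = np.array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])  # toy targets
w = np.ones_like(y)                            # sample weights

left_observed = np.array([2, 3])  # samples sent left based on observed feature values
right = np.array([4, 5])          # samples sent right
missing = np.array([0, 1])        # samples whose feature value is NaN
missing_go_to_left = True

# The fix: missing samples must contribute to the left sums when they go left.
left = np.concatenate([left_observed, missing]) if missing_go_to_left else left_observed

def weighted_mse(idx):
    """Weighted MSE of a child node, i.e. E[y^2] - (E[y])^2 under weights w."""
    weighted_n = w[idx].sum()
    sq_sum = (w[idx] * y[idx] ** 2).sum()
    mean = (w[idx] * y[idx]).sum() / weighted_n
    return sq_sum / weighted_n - mean**2

print(weighted_mse(left), weighted_mse(right))

The right child's squared sum is obtained by subtraction (sq_sum_right = sq_sum_total - sq_sum_left), so leaving the missing samples out of sq_sum_left skewed both children's impurities.
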
4 changes: 4 additions & 0 deletions sklearn/tree/_splitter.pyx
@@ -539,6 +539,10 @@ cdef inline int node_split_best(
            best_split.impurity_right
        )

        # Note: this should always be called at the very end because it will
        # move samples around, thereby affecting the criterion.
        # This affects the computation of the children impurity, which affects
        # the computation of the next node.
        shift_missing_values_to_left_if_required(&best_split, samples, end)

    # Respect invariant for constant features: the original order of
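Conceptually, the per-node sample-index buffer is laid out as [left partition | right partition | missing-value samples], and shift_missing_values_to_left_if_required swaps the trailing missing indices into the left partition when the chosen split sends them left. A plain-Python sketch of that idea (not the actual Cython helper; the function below is a simplified stand-in):

import numpy as np

def shift_missing_to_left(samples, pos, end, n_missing, missing_go_to_left):
    """Swap the trailing missing-value sample indices into the left partition."""
    if not missing_go_to_left or n_missing == 0:
        return pos
    for i in range(n_missing):
        left_slot = pos + i
        missing_slot = end - n_missing + i
        samples[left_slot], samples[missing_slot] = samples[missing_slot], samples[left_slot]
    # The split position moves past the samples that were shifted in.
    return pos + n_missing

samples = np.array([10, 11, 12, 13, 14, 15])  # hypothetical sample indices
new_pos = shift_missing_to_left(samples, pos=2, end=6, n_missing=2, missing_go_to_left=True)
print(samples, new_pos)  # indices 14 and 15 (the missing samples) are now in the left block

Because this reordering changes what a later children_impurity() call sees, the note added above insists the helper runs only after the split's impurities and improvement have been computed.
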
21 changes: 16 additions & 5 deletions sklearn/tree/tests/test_tree.py
@@ -2617,7 +2617,19 @@ def test_deterministic_pickle():
     assert pickle1 == pickle2
 
 
-def test_regression_tree_missing_values_toy():
+@pytest.mark.parametrize(
+    "X",
+    [
+        # missing values will go left for greedy splits
+        np.array([np.nan, 2, np.nan, 4, 5, 6]),
+        np.array([np.nan, np.nan, 3, 4, 5, 6]),
+        # missing values will go right for greedy splits
+        np.array([1, 2, 3, 4, np.nan, np.nan]),
+        np.array([1, 2, 3, np.nan, 6, np.nan]),
+    ],
+)
+@pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"])
+def test_regression_tree_missing_values_toy(X, criterion):
     """Check that we properly handle missing values in regression trees using a toy
     dataset.
 
@@ -2631,12 +2643,11 @@
     https://github.com/scikit-learn/scikit-learn/issues/28254
     """
 
-    # With this dataset, the missing values will always be sent to the left child
-    # at the first split. The leaf will be pure.
-    X = np.array([np.nan, np.nan, 3, 4, 5, 6]).reshape(-1, 1)
+    # With certain datasets, the missing values will always be sent to the left
+    # or right child at the first split. The leaf will be pure.
     y = np.arange(6)
 
-    tree = DecisionTreeRegressor(random_state=0).fit(X, y)
+    tree = DecisionTreeRegressor(criterion=criterion, random_state=0).fit(X, y)
     assert all(tree.tree_.impurity >= 0)  # MSE should always be positive
 
     # Find the leaves with a single sample where the MSE should be 0
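The rest of the test body is collapsed above. One way to express the final check hinted at by that comment (an illustrative sketch, not necessarily the PR's exact assertion) is:

import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.array([np.nan, np.nan, 3, 4, 5, 6]).reshape(-1, 1)
y = np.arange(6)
tree = DecisionTreeRegressor(criterion="friedman_mse", random_state=0).fit(X, y)

# Leaves have no children; any leaf holding a single sample must have zero MSE.
is_leaf = tree.tree_.children_left == -1
single_sample = tree.tree_.n_node_samples == 1
assert np.allclose(tree.tree_.impurity[is_leaf & single_sample], 0.0)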