
[python-package] Add feature_names_in_ attribute for scikit-learn estimators (fixes #6279) #6310

Merged

Commits (21; diff shown from 14 commits)
8209ffa expose feature_name_ via sklearn consistent attribute feature_names_in_ (nicklamiller, Feb 12, 2024)
52835d8 fix docstring (nicklamiller, Feb 13, 2024)
adc7683 raise error if estimator not fitted (nicklamiller, Feb 13, 2024)
08e67aa ensure exact feature match for feature_names_in_ attribute (nicklamiller, Mar 17, 2024)
0ecc337 add test for numpy input (nicklamiller, Mar 28, 2024)
c110c9d add test for pandas input with feature names (nicklamiller, Mar 28, 2024)
a8a5631 add documentation for when input data has no feature names (nicklamiller, Mar 28, 2024)
4e1f1dc pre-commit fixes (nicklamiller, Mar 28, 2024)
b826426 feature_names_in_ returns a 1D numpy array (nicklamiller, May 31, 2024)
fd1ce7c test LGBMModel, LGBMClassifier, LGBMRegressor, LGBMRanker (nicklamiller, May 31, 2024)
edd951a rearrange feature name property docstrings (nicklamiller, May 31, 2024)
25888c6 add get_feature_names_out method (nicklamiller, Jun 1, 2024)
574d9ce format reference to .feature_name_ with ticks (nicklamiller, Jun 1, 2024)
e55474f Merge branch 'master' into add-sklearn-feature-attributes (nicklamiller, Jun 6, 2024)
8ac21d3 remove get_feature_names_out method, tidy up tests (nicklamiller, Jun 11, 2024)
318c3a4 Merge branch 'master' into add-sklearn-feature-attributes (nicklamiller, Jun 13, 2024)
be2bed0 Merge branch 'master' into add-sklearn-feature-attributes (nicklamiller, Jun 14, 2024)
d34d48f Merge branch 'master' into add-sklearn-feature-attributes (nicklamiller, Jun 21, 2024)
346fb78 Merge branch 'master' into add-sklearn-feature-attributes (nicklamiller, Jun 21, 2024)
11c8334 Merge branch 'master' into add-sklearn-feature-attributes (nicklamiller, Jun 22, 2024)
a8ddc66 Merge branch 'master' into add-sklearn-feature-attributes (jameslamb, Jul 3, 2024)
20 changes: 19 additions & 1 deletion python-package/lightgbm/sklearn.py
@@ -1043,6 +1043,12 @@ def predict(
**predict_params,
)

def get_feature_names_out(self) -> np.ndarray:
""":obj:`array` of shape = [n_features]: Get output features of fitted model."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError("Output features cannot be determined. Need to call fit beforehand.")
return self.feature_names_in_

predict.__doc__ = _lgbmmodel_doc_predict.format(
description="Return the predicted value for each sample.",
X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
@@ -1144,11 +1150,23 @@ def feature_importances_(self) -> np.ndarray:

@property
def feature_name_(self) -> List[str]:
""":obj:`list` of shape = [n_features]: The names of features."""
""":obj:`list` of shape = [n_features]: The names of features.

.. note::

If input does not contain feature names, they will be added during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.
"""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
return self._Booster.feature_name() # type: ignore[union-attr]

@property
def feature_names_in_(self) -> np.ndarray:
""":obj:`array` of shape = [n_features]: scikit-learn compatible version of ``.feature_name_``."""
if not self.__sklearn_is_fitted__():
raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
return np.array(self.feature_name_)


class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
"""LightGBM regressor."""
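To illustrate the pattern the sklearn.py diff above adds, here is a minimal, self-contained sketch. The class and error names (`MiniModel`, `NotFittedError`) are stand-ins, not the real `LGBMModel` or `LGBMNotFittedError`: `feature_name_` is LightGBM's native list of names (auto-filled as `Column_0`, `Column_1`, ... when the input has none), and `feature_names_in_` exposes the same names as a 1D numpy array, matching scikit-learn's convention.

```python
import numpy as np


class NotFittedError(Exception):
    """Stand-in for lightgbm's LGBMNotFittedError in this sketch."""


class MiniModel:
    """Sketch of the property pattern this PR adds (not the real LGBMModel)."""

    def __init__(self):
        self._fitted = False
        self._names = None

    def fit(self, X, feature_names=None):
        X = np.asarray(X)
        if feature_names is None:
            # Mimics LightGBM filling in Column_0 ... Column_N for unnamed input
            feature_names = [f"Column_{i}" for i in range(X.shape[1])]
        self._names = list(feature_names)
        self._fitted = True
        return self

    @property
    def feature_name_(self):
        # list of str, raises if the model has not been fit
        if not self._fitted:
            raise NotFittedError("No feature_name found. Need to call fit beforehand.")
        return self._names

    @property
    def feature_names_in_(self):
        # scikit-learn compatible view: same names, as a 1D numpy array
        if not self._fitted:
            raise NotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
        return np.array(self.feature_name_)
```

The real implementation delegates `feature_name_` to the underlying `Booster`, but the fitted-check plus `np.array(...)` wrapper is the same shape as the diff above.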
84 changes: 84 additions & 0 deletions tests/python_package_test/test_sklearn.py
@@ -1290,6 +1290,90 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


def test_getting_feature_names_in_np_input():
Collaborator comment:

There is quite a lot of repetition in these tests. Generally I'm supportive of repetition in tests, in favor of making it easier to diagnose issues or selectively include / exclude tests... but for the cases of .feature_names_in_ and .get_feature_names_out(), I think the tests should be combined.

So can you please reduce these 4 tests down to 2? One for numpy input without feature names, one for pandas input with feature names?

Ending with assertions like this:

expected_col_names = np.array([f"Column_{i}" for i in range(X.shape[1])])
np.testing.assert_array_equal(model.feature_names_in_, expected_col_names)
np.testing.assert_array_equal(model.get_feature_names_out(), expected_col_names)

# input is a numpy array, which doesn't have feature names. LightGBM adds
# feature names to the fitted model, which is inconsistent with sklearn's behavior
X, y = load_digits(n_class=2, return_X_y=True)
est = lgb.LGBMModel(n_estimators=5, objective="binary")
clf = lgb.LGBMClassifier(n_estimators=5)
reg = lgb.LGBMRegressor(n_estimators=5)
rnk = lgb.LGBMRanker(n_estimators=5)
models = (est, clf, reg, rnk)
group = np.full(shape=(X.shape[0] // 2,), fill_value=2) # Just an example group

for model in models:
with pytest.raises(lgb.compat.LGBMNotFittedError):
check_is_fitted(model)
if isinstance(model, lgb.LGBMRanker):
model.fit(X, y, group=group)
else:
model.fit(X, y)
np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))


def test_getting_feature_names_in_pd_input():
# as_frame=True means input has column names and these should propagate to fitted model
X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
Collaborator comment:

Instead of using a code comment, could you please test for this directly? That'd ensure that if load_digits() behavior around feature names ever changes, this test will fail and alert us instead of silently passing or maybe failing in some other hard-to-understand way.

Suggested change:

-    # as_frame=True means input has column names and these should propagate to fitted model
-    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
+    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
+    col_names = X.columns
+    assert isinstance(col_names, list) and all(isinstance(c, str) for c in col_names), "input data must have feature names for this test to cover the expected functionality"

est = lgb.LGBMModel(n_estimators=5, objective="binary")
Collaborator comment:

Can you please extend these tests to cover all 4 estimators (LGBMModel, LGBMClassifier, LGBMRegressor, LGBMRanker)? I know that those last 3 inherit from LGBMModel, but if someone were to make a change in how this attribute works for, say, LGBMClassifier only that breaks this behavior, we'd want a failing test to alert us to that.

Follow the same pattern used in the existing test right above these, test_check_is_fitted(), using the same data for all of the estimators.

clf = lgb.LGBMClassifier(n_estimators=5)
reg = lgb.LGBMRegressor(n_estimators=5)
rnk = lgb.LGBMRanker(n_estimators=5)
models = (est, clf, reg, rnk)
group = np.full(shape=(X.shape[0] // 2,), fill_value=2) # Just an example group

for model in models:
with pytest.raises(lgb.compat.LGBMNotFittedError):
check_is_fitted(model)
if isinstance(model, lgb.LGBMRanker):
model.fit(X, y, group=group)
else:
model.fit(X, y)
np.testing.assert_array_equal(est.feature_names_in_, X.columns)
Collaborator comment:

Suggested change:

-    np.testing.assert_array_equal(est.feature_names_in_, X.columns)
+    np.testing.assert_array_equal(model.feature_names_in_, X.columns)

Instead of doing this for loop approach, could you please change these tests to parameterize over classes, like this?

@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])

That'd reduce the risk of mistakes like this one (where only the LGBMModel instance, est, is being tested).



def test_get_feature_names_out_np_input():
# input is a numpy array, which doesn't have feature names. LightGBM adds
# feature names to the fitted model, which is inconsistent with sklearn's behavior
X, y = load_digits(n_class=2, return_X_y=True)
est = lgb.LGBMModel(n_estimators=5, objective="binary")
clf = lgb.LGBMClassifier(n_estimators=5)
reg = lgb.LGBMRegressor(n_estimators=5)
rnk = lgb.LGBMRanker(n_estimators=5)
Collaborator comment:

Since the thing being tested in this PR isn't really dependent on the content of the learned model, could you please use n_estimators=2 and num_leaves=7 in all the tests? That'd make the tests slightly faster and cheaper without reducing their effectiveness in detecting issues.

models = (est, clf, reg, rnk)
group = np.full(shape=(X.shape[0] // 2,), fill_value=2) # Just an example group
Collaborator comment:

For simplicity, please just treat all samples in X as part of a single query group. LightGBM supports that, and it won't materially change the effectiveness of these tests.

Suggested change:

-    group = np.full(shape=(X.shape[0] // 2,), fill_value=2)  # Just an example group
+    group = [X.shape[0]]
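As a sketch of the `group` semantics behind this suggestion: in LightGBM's ranker API, `group` lists query-group sizes, and those sizes must sum to the number of rows in X. The `n_samples = 360` constant below is an assumption for illustration (the row count of `load_digits(n_class=2)`), not something computed here.

```python
import numpy as np

# `group` lists query-group sizes; they must sum to the number of rows in X.
n_samples = 360  # illustrative stand-in for X.shape[0]

# The test's original choice: 180 groups of 2 samples each
group_many = np.full(shape=(n_samples // 2,), fill_value=2)
assert group_many.sum() == n_samples

# The reviewer's simpler suggestion: one group containing every sample
group_one = [n_samples]
assert sum(group_one) == n_samples
```

Either form is a valid `group` argument for `LGBMRanker.fit`; the single-group version is just less code and equally effective for a test that only inspects feature names.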


for model in models:
with pytest.raises(lgb.compat.LGBMNotFittedError):
check_is_fitted(model)
if isinstance(model, lgb.LGBMRanker):
model.fit(X, y, group=group)
else:
model.fit(X, y)
np.testing.assert_array_equal(
model.get_feature_names_out(), np.array([f"Column_{i}" for i in range(X.shape[1])])
)


def test_get_feature_names_out_pd_input():
# as_frame=True means input has column names and these should propagate to fitted model
X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
est = lgb.LGBMModel(n_estimators=5, objective="binary")
clf = lgb.LGBMClassifier(n_estimators=5)
reg = lgb.LGBMRegressor(n_estimators=5)
rnk = lgb.LGBMRanker(n_estimators=5)
models = (est, clf, reg, rnk)
group = np.full(shape=(X.shape[0] // 2,), fill_value=2) # Just an example group

for model in models:
with pytest.raises(lgb.compat.LGBMNotFittedError):
check_is_fitted(model)
if isinstance(model, lgb.LGBMRanker):
model.fit(X, y, group=group)
else:
model.fit(X, y)
np.testing.assert_array_equal(model.get_feature_names_out(), X.columns)


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
def test_sklearn_integration(estimator, check):
estimator.set_params(min_child_samples=1, min_data_in_bin=1)