Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package] Add feature_names_in_ attribute for scikit-learn estimators (fixes #6279) #6310

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
8209ffa
expose feature_name_ via sklearn consistent attribute feature_names_in_
nicklamiller Feb 12, 2024
52835d8
fix docstring
nicklamiller Feb 13, 2024
adc7683
raise error if estimator not fitted
nicklamiller Feb 13, 2024
08e67aa
ensure exact feature match for feature_names_in_ attribute
nicklamiller Mar 17, 2024
0ecc337
add test for numpy input
nicklamiller Mar 28, 2024
c110c9d
add test for pandas input with feature names
nicklamiller Mar 28, 2024
a8a5631
add documentation for when input data has no feature names
nicklamiller Mar 28, 2024
4e1f1dc
pre-commit fixes
nicklamiller Mar 28, 2024
b826426
feature_names_in_ returns a 1D numpy array
nicklamiller May 31, 2024
fd1ce7c
test LGBMModel, LGBMClassifier, LGBMRegressor, LGBMRanker
nicklamiller May 31, 2024
edd951a
rearrange feature name property docstrings
nicklamiller May 31, 2024
25888c6
add get_feature_names_out method
nicklamiller Jun 1, 2024
574d9ce
format reference to .feature_name_ with ticks
nicklamiller Jun 1, 2024
e55474f
Merge branch 'master' into add-sklearn-feature-attributes
nicklamiller Jun 6, 2024
8ac21d3
remove get_feature_names_out method, tidy up tests
nicklamiller Jun 11, 2024
318c3a4
Merge branch 'master' into add-sklearn-feature-attributes
nicklamiller Jun 13, 2024
be2bed0
Merge branch 'master' into add-sklearn-feature-attributes
nicklamiller Jun 14, 2024
d34d48f
Merge branch 'master' into add-sklearn-feature-attributes
nicklamiller Jun 21, 2024
346fb78
Merge branch 'master' into add-sklearn-feature-attributes
nicklamiller Jun 21, 2024
11c8334
Merge branch 'master' into add-sklearn-feature-attributes
nicklamiller Jun 22, 2024
a8ddc66
Merge branch 'master' into add-sklearn-feature-attributes
jameslamb Jul 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion python-package/lightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,11 +1144,23 @@ def feature_importances_(self) -> np.ndarray:

@property
def feature_name_(self) -> List[str]:
    """:obj:`list` of shape = [n_features]: The names of features.

    .. note::

        If input does not contain feature names, they will be added during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.
    """
    # Fail with an explicit "not fitted" error before touching the Booster.
    if not self.__sklearn_is_fitted__():
        raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
    # The names are read straight from the underlying Booster.
    return self._Booster.feature_name()  # type: ignore[union-attr]

@property
def feature_names_in_(self) -> np.ndarray:
    """:obj:`array` of shape = [n_features]: scikit-learn compatible version of ``.feature_name_``.

    The values are those of ``.feature_name_``, wrapped in a numpy array.
    ``LGBMNotFittedError`` is raised when the estimator reports that it has not been fitted.
    """
    if self.__sklearn_is_fitted__():
        fitted_names = self.feature_name_
        return np.array(fitted_names)
    raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")


class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
"""LightGBM regressor."""
Expand Down
40 changes: 40 additions & 0 deletions tests/python_package_test/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,6 +1290,46 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
def test_getting_feature_names_in_np_input(estimator_class):
    """Check ``feature_names_in_`` after fitting on a numpy array.

    A numpy array carries no feature names. LightGBM adds feature names to the
    fitted model (``Column_0``, ``Column_1``, ...), which is inconsistent with
    sklearn's behavior.
    """
    X, y = load_digits(n_class=2, return_X_y=True)

    init_kwargs = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        # the generic LGBMModel is given an explicit objective
        init_kwargs["objective"] = "binary"
    model = estimator_class(**init_kwargs)

    # before fitting, the estimator must report itself as not fitted
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)

    # the ranker is fitted with all rows in a single query group
    fit_kwargs = {"group": [X.shape[0]]} if isinstance(model, lgb.LGBMRanker) else {}
    model.fit(X, y, **fit_kwargs)

    expected_names = np.array([f"Column_{i}" for i in range(X.shape[1])])
    np.testing.assert_array_equal(model.feature_names_in_, expected_names)


@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
def test_getting_feature_names_in_pd_input(estimator_class):
    """Check ``feature_names_in_`` after fitting on a pandas DataFrame.

    The fitted estimator's ``feature_names_in_`` is compared with the
    DataFrame's own columns.
    """
    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)

    # guard: the test is only meaningful when the input really has string names
    col_names = X.columns.to_list()
    names_are_strings = isinstance(col_names, list) and all(isinstance(name, str) for name in col_names)
    assert names_are_strings, "input data must have feature names for this test to cover the expected functionality"

    init_kwargs = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        # the generic LGBMModel is given an explicit objective
        init_kwargs["objective"] = "binary"
    model = estimator_class(**init_kwargs)

    # before fitting, the estimator must report itself as not fitted
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)

    # the ranker is fitted with all rows in a single query group
    fit_kwargs = {"group": [X.shape[0]]} if isinstance(model, lgb.LGBMRanker) else {}
    model.fit(X, y, **fit_kwargs)

    np.testing.assert_array_equal(model.feature_names_in_, X.columns)


@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
def test_sklearn_integration(estimator, check):
estimator.set_params(min_child_samples=1, min_data_in_bin=1)
Expand Down
Loading