Fixed failed pre-commit.ci hooks: formatting errors in algorithms.py, inconsistent-namespace-usage in test_isin.py, sorted whatsnew entry
pandas-dev · Jan 20, 2025 · cb16826 · cb16826
1 parent dbe3673
commit cb16826
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 21 deletions.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -800,11 +800,11 @@ Other
 - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
+- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
 - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
 - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values. (:issue:`60688`)
 - Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`)
-- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
 - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
 - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -23,9 +23,7 @@
     iNaT,
     lib,
 )
-
 from pandas._libs.missing import NA
-
 from pandas._typing import (
     AnyArrayLike,
     ArrayLike,
@@ -543,23 +541,24 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
     elif isinstance(values.dtype, ExtensionDtype):
         return isin(np.asarray(comps_array), np.asarray(values))
 
-    # GH16012
-    # Ensure np.isin doesn't get object types or it *may* throw an exception
-    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
-    # isin is faster for small sizes
-
     # GH60678
     # Ensure values don't contain <NA>, otherwise it throws exception with np.in1d
+
     values_contains_NA = False
-    
-    if comps_array.dtype != object and len(values) <= 26:  
+
+    if comps_array.dtype != object and len(values) <= 26:
         values_contains_NA = any(v is NA for v in values)
 
+    # GH16012
+    # Ensure np.isin doesn't get object types or it *may* throw an exception
+    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
+    # isin is faster for small sizes
+
     if (
         len(comps_array) > _MINIMUM_COMP_ARR_LEN
         and len(values) <= 26
         and comps_array.dtype != object
-        and values_contains_NA == False
+        and not values_contains_NA
     ):
         # If the values include nan we need to check for nan explicitly
         # since np.nan it not equal to np.nan

diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py
@@ -211,26 +211,30 @@ def test_isin_large_series_mixed_dtypes_and_nan(monkeypatch):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("dtype, data, values, expected", [
-    ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]),
-    ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]),
-    ("boolean", [pd.NA, False, True], [pd.NA, True, 'a', 20], [True, False, True]),
-    ("boolean", [pd.NA, False, True], [], [False, False, False]),
-    ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]),
-])
+@pytest.mark.parametrize(
+    "dtype, data, values, expected",
+    [
+        ("boolean", [pd.NA, False, True], [False, pd.NA], [True, True, False]),
+        ("Int64", [pd.NA, 2, 1], [1, pd.NA], [True, False, True]),
+        ("boolean", [pd.NA, False, True], [pd.NA, True, "a", 20], [True, False, True]),
+        ("boolean", [pd.NA, False, True], [], [False, False, False]),
+        ("Float64", [20.0, 30.0, pd.NA], [pd.NA], [False, False, True]),
+    ],
+)
 def test_isin_large_series_and_pdNA(dtype, data, values, expected, monkeypatch):
     # https://github.com/pandas-dev/pandas/issues/60678
-    # combination of  large series (> _MINIMUM_COMP_ARR_LEN elements) and 
-    # values contains pdNA  
+    # combination of  large series (> _MINIMUM_COMP_ARR_LEN elements) and
+    # values contains pdNA
     min_isin_comp = 2
     ser = Series(data, dtype=dtype)
-    expected = pd.Series(expected, dtype="boolean")
+    expected = Series(expected, dtype="boolean")
 
     with monkeypatch.context() as m:
         m.setattr(algorithms, "_MINIMUM_COMP_ARR_LEN", min_isin_comp)
         result = ser.isin(values)
     tm.assert_series_equal(result, expected)
 
+
 def test_isin_complex_numbers():
     # GH 17927
     array = [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j]