From 0f64c13169f415a3b70c642f2e1dfdbd16a225a9 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 31 May 2025 17:01:14 +0400 Subject: [PATCH 1/2] ENH: Implement DataFrame.select --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 113 ++++++++++++++++++++++ pandas/tests/frame/methods/test_select.py | 85 ++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 pandas/tests/frame/methods/test_select.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 099e5bc48353a..65b8513d5ce56 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,6 +33,7 @@ Other enhancements - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) +- Added new :meth:`DataFrame.select` method to select a subset of columns from the :class:`DataFrame` (:issue:`61522`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b2c1e38f61f4c..4a603ba474a40 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4479,6 +4479,119 @@ def _get_item(self, item: Hashable) -> Series: # ---------------------------------------------------------------------- # Unsorted + def select(self, *args): + """ + Select a subset of columns from the DataFrame. + + Select can be used to return a DataFrame with some specific columns. 
+ This can be used to remove unwanted columns, as well as to return a + DataFrame with the columns sorted in a specific order. + + Parameters + ---------- + *args : hashable or tuple of hashable + The names or the columns to return. In general this will be strings, + but pandas supports other types of column names, if they are hashable. + + Returns + ------- + DataFrame + The DataFrame with the selected columns. + + See Also + -------- + DataFrame.filter : To return a subset of rows, instead of a subset of columns. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "first_name": ["John", "Alice", "Bob"], + ... "last_name": ["Smith", "Cooper", "Marley"], + ... "age": [61, 22, 35], + ... } + ... ) + + Select a subset of columns: + + >>> df.select("first_name", "age") + first_name age + 0 John 61 + 1 Alice 22 + 2 Bob 35 + + Selecting with a pattern can be done with Python expressions: + + >>> df.select(*[col for col in df.columns if col.endswith("_name")]) + first_name last_name + 0 John Smith + 1 Alice Cooper + 2 Bob Marley + + All columns can be selected, but in a different order: + + >>> df.select("last_name", "first_name", "age") + last_name first_name age + 0 Smith John 61 + 1 Cooper Alice 22 + 2 Marley Bob 35 + + In case the columns are in a list, Python unpacking with star can be used: + + >>> columns = ["last_name", "age"] + >>> df.select(*columns) + last_name age + 0 Smith 61 + 1 Cooper 22 + 2 Marley 35 + + Note that a DataFrame is always returned. If a single column is requested, a + DataFrame with a single column is returned, not a Series: + + >>> df.select("age") + age + 0 61 + 1 22 + 2 35 + + The ``select`` method also works when columns are a ``MultiIndex``: + + >>> df = pd.DataFrame( + ... [("John", "Smith", 61), ("Alice", "Cooper", 22), ("Bob", "Marley", 35)], + ... columns=pd.MultiIndex.from_tuples( + ... [("names", "first_name"), ("names", "last_name"), ("other", "age")] + ... ), + ... 
) + + If just column names are provided, they will select from the first level of the + ``MultiIndex``: + + >>> df.select("names") + names + first_name last_name + 0 John Smith + 1 Alice Cooper + 2 Bob Marley + + To select from multiple or all levels, tuples can be provided: + + >>> df.select(("names", "last_name"), ("other", "age")) + names other + last_name age + 0 Smith 61 + 1 Cooper 22 + 2 Marley 35 + """ + if args and isinstance(args[0], list): + raise ValueError( + "`DataFrame.select` does not support a list. Please use " + "`df.select('col1', 'col2',...)` or `df.select(*['col1', 'col2',...])` " + "instead" + ) + + indexer = self.columns._get_indexer_strict(list(args), "columns")[1] + return self.take(indexer, axis=1) + @overload def query( self, diff --git a/pandas/tests/frame/methods/test_select.py b/pandas/tests/frame/methods/test_select.py new file mode 100644 index 0000000000000..accf3ea336e18 --- /dev/null +++ b/pandas/tests/frame/methods/test_select.py @@ -0,0 +1,85 @@ +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.fixture +def regular_df(): + return DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]}) + + +@pytest.fixture +def multiindex_df(): + return DataFrame( + [(0, 2, 4), (1, 3, 5)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("A", "d"), ("B", "e")]), + ) + + +class TestSelect: + def test_select_subset_cols(self, regular_df): + expected = DataFrame({"a": [1, 2], "c": [5, 6]}) + result = regular_df.select("a", "c") + tm.assert_frame_equal(result, expected) + + def test_single_value(self, regular_df): + expected = DataFrame({"a": [1, 2]}) + result = regular_df.select("a") + assert isinstance(result, DataFrame) + tm.assert_frame_equal(result, expected) + + def test_select_change_order(self, regular_df): + expected = DataFrame({"b": [3, 4], "d": [7, 8], "a": [1, 2], "c": [5, 6]}) + result = regular_df.select("b", "d", "a", "c") + tm.assert_frame_equal(result, expected) + 
+ def test_select_none(self, regular_df): + result = regular_df.select() + assert result.empty + + def test_select_duplicated(self, regular_df): + expected = ["a", "d", "a"] + result = regular_df.select("a", "d", "a") + assert result.columns.tolist() == expected + + def test_select_list(self, regular_df): + with pytest.raises(ValueError, match="does not support a list"): + regular_df.select(["a", "b"]) + + def test_select_missing(self, regular_df): + with pytest.raises(KeyError, match=r"None of .* are in the \[columns\]"): + regular_df.select("z") + + def test_select_not_hashable(self, regular_df): + with pytest.raises(TypeError, match="unhashable type"): + regular_df.select(set()) + + def test_select_multiindex_one_level(self, multiindex_df): + expected = DataFrame( + [(0, 2), (1, 3)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("A", "d")]), + ) + result = multiindex_df.select("A") + tm.assert_frame_equal(result, expected) + + def test_select_multiindex_single_column(self, multiindex_df): + expected = DataFrame( + [(2,), (3,)], columns=pd.MultiIndex.from_tuples([("A", "d")]) + ) + result = multiindex_df.select(("A", "d")) + assert isinstance(result, DataFrame) + tm.assert_frame_equal(result, expected) + + def test_select_multiindex_multiple_columns(self, multiindex_df): + expected = DataFrame( + [(0, 4), (1, 5)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")]), + ) + result = multiindex_df.select(("A", "c"), ("B", "e")) + tm.assert_frame_equal(result, expected) + + def test_select_multiindex_missing(self, multiindex_df): + with pytest.raises(KeyError, match="not in index"): + multiindex_df.select("Z") From bf2a9ea1c3057bf53b806e3fc369429edb5ea55a Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 12 Jun 2025 12:07:59 +0200 Subject: [PATCH 2/2] Making select work with a list parameter --- pandas/core/frame.py | 46 +++++++++++++---------- pandas/tests/frame/methods/test_select.py | 19 ++++++++-- 2 files changed, 42 insertions(+), 23 
deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a603ba474a40..09b85a0bb2037 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4489,9 +4489,11 @@ def select(self, *args): Parameters ---------- - *args : hashable or tuple of hashable - The names or the columns to return. In general this will be strings, + *args : hashable or a single list arg of hashable + The names of the columns to return. In general this will be strings, but pandas supports other types of column names, if they are hashable. + If only one argument of type list is provided, the elements of the + list will be considered the names of the columns to be returned. Returns ------- @@ -4520,9 +4522,17 @@ def select(self, *args): 1 Alice 22 2 Bob 35 + A list can also be used to specify the names of the columns to return: + + >>> df.select(["last_name", "age"]) + last_name age + 0 Smith 61 + 1 Cooper 22 + 2 Marley 35 + Selecting with a pattern can be done with Python expressions: - >>> df.select(*[col for col in df.columns if col.endswith("_name")]) + >>> df.select([col for col in df.columns if col.endswith("_name")]) first_name last_name 0 John Smith 1 Alice Cooper @@ -4536,15 +4546,6 @@ def select(self, *args): 1 Cooper Alice 22 2 Marley Bob 35 - In case the columns are in a list, Python unpacking with star can be used: - - >>> columns = ["last_name", "age"] - >>> df.select(*columns) - last_name age - 0 Smith 61 - 1 Cooper 22 - 2 Marley 35 - Note that a DataFrame is always returned. If a single column is requested, a DataFrame with a single column is returned, not a Series: @@ -4563,8 +4564,8 @@ def select(self, *args): ... ), ... 
) - If just column names are provided, they will select from the first level of the - ``MultiIndex``: + If column names are provided, they will select from the first level of + the ``MultiIndex``: >>> df.select("names") names @@ -4573,7 +4574,7 @@ def select(self, *args): 1 Alice Cooper 2 Bob Marley - To select from multiple or all levels, tuples can be provided: + To select from multiple or all levels, tuples can be used: >>> df.select(("names", "last_name"), ("other", "age")) names other @@ -4583,11 +4584,16 @@ def select(self, *args): 2 Marley 35 """ if args and isinstance(args[0], list): - raise ValueError( - "`DataFrame.select` does not support a list. Please use " - "`df.select('col1', 'col2',...)` or `df.select(*['col1', 'col2',...])` " - "instead" - ) + if len(args) == 1: + args = args[0] + else: + raise ValueError( + "`DataFrame.select` supports individual columns " + "`df.select('col1', 'col2',...)` or a list " + "`df.select(['col1', 'col2',...])`, but not both. " + "You can unpack the list if you have a mix: " + "`df.select(*['col1', 'col2'], 'col3')`." 
+ ) indexer = self.columns._get_indexer_strict(list(args), "columns")[1] return self.take(indexer, axis=1) diff --git a/pandas/tests/frame/methods/test_select.py b/pandas/tests/frame/methods/test_select.py index accf3ea336e18..6aab179e3644b 100644 --- a/pandas/tests/frame/methods/test_select.py +++ b/pandas/tests/frame/methods/test_select.py @@ -44,9 +44,14 @@ def test_select_duplicated(self, regular_df): result = regular_df.select("a", "d", "a") assert result.columns.tolist() == expected - def test_select_list(self, regular_df): - with pytest.raises(ValueError, match="does not support a list"): - regular_df.select(["a", "b"]) + def test_select_single_list(self, regular_df): + expected = DataFrame({"a": [1, 2], "c": [5, 6]}) + result = regular_df.select(["a", "c"]) + tm.assert_frame_equal(result, expected) + + def test_select_list_and_string(self, regular_df): + with pytest.raises(ValueError, match="supports individual columns"): + regular_df.select(["a", "c"], "b") def test_select_missing(self, regular_df): with pytest.raises(KeyError, match=r"None of .* are in the \[columns\]"): @@ -80,6 +85,14 @@ def test_select_multiindex_multiple_columns(self, multiindex_df): result = multiindex_df.select(("A", "c"), ("B", "e")) tm.assert_frame_equal(result, expected) + def test_select_multiindex_multiple_columns_as_list(self, multiindex_df): + expected = DataFrame( + [(0, 4), (1, 5)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")]), + ) + result = multiindex_df.select([("A", "c"), ("B", "e")]) + tm.assert_frame_equal(result, expected) + def test_select_multiindex_missing(self, multiindex_df): with pytest.raises(KeyError, match="not in index"): multiindex_df.select("Z")