From 0f64c13169f415a3b70c642f2e1dfdbd16a225a9 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sat, 31 May 2025 17:01:14 +0400 Subject: [PATCH 1/4] ENH: Implement DataFrame.select --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 113 ++++++++++++++++++++++ pandas/tests/frame/methods/test_select.py | 85 ++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 pandas/tests/frame/methods/test_select.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 099e5bc48353a..65b8513d5ce56 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,6 +33,7 @@ Other enhancements - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) +- Added new :meth:`DataFrame.select` method to select a subset of columns from the :class:`DataFrame` (:issue:`61522`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b2c1e38f61f4c..4a603ba474a40 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4479,6 +4479,119 @@ def _get_item(self, item: Hashable) -> Series: # ---------------------------------------------------------------------- # Unsorted + def select(self, *args): + """ + Select a subset of columns from the DataFrame. + + Select can be used to return a DataFrame with some specific columns. 
+ This can be used to remove unwanted columns, as well as to return a + DataFrame with the columns sorted in a specific order. + + Parameters + ---------- + *args : hashable or tuple of hashable + The names or the columns to return. In general this will be strings, + but pandas supports other types of column names, if they are hashable. + + Returns + ------- + DataFrame + The DataFrame with the selected columns. + + See Also + -------- + DataFrame.filter : To return a subset of rows, instead of a subset of columns. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "first_name": ["John", "Alice", "Bob"], + ... "last_name": ["Smith", "Cooper", "Marley"], + ... "age": [61, 22, 35], + ... } + ... ) + + Select a subset of columns: + + >>> df.select("first_name", "age") + first_name age + 0 John 61 + 1 Alice 22 + 2 Bob 35 + + Selecting with a pattern can be done with Python expressions: + + >>> df.select(*[col for col in df.columns if col.endswith("_name")]) + first_name last_name + 0 John Smith + 1 Alice Cooper + 2 Bob Marley + + All columns can be selected, but in a different order: + + >>> df.select("last_name", "first_name", "age") + last_name first_name age + 0 Smith John 61 + 1 Cooper Alice 22 + 2 Marley Bob 35 + + In case the columns are in a list, Python unpacking with star can be used: + + >>> columns = ["last_name", "age"] + >>> df.select(*columns) + last_name age + 0 Smith 61 + 1 Cooper 22 + 2 Marley 35 + + Note that a DataFrame is always returned. If a single column is requested, a + DataFrame with a single column is returned, not a Series: + + >>> df.select("age") + age + 0 61 + 1 22 + 2 35 + + The ``select`` method also works when columns are a ``MultiIndex``: + + >>> df = pd.DataFrame( + ... [("John", "Smith", 61), ("Alice", "Cooper", 22), ("Bob", "Marley", 35)], + ... columns=pd.MultiIndex.from_tuples( + ... [("names", "first_name"), ("names", "last_name"), ("other", "age")] + ... ), + ... 
) + + If just column names are provided, they will select from the first level of the + ``MultiIndex``: + + >>> df.select("names") + names + first_name last_name + 0 John Smith + 1 Alice Cooper + 2 Bob Marley + + To select from multiple or all levels, tuples can be provided: + + >>> df.select(("names", "last_name"), ("other", "age")) + names other + last_name age + 0 Smith 61 + 1 Cooper 22 + 2 Marley 35 + """ + if args and isinstance(args[0], list): + raise ValueError( + "`DataFrame.select` does not support a list. Please use " + "`df.select('col1', 'col2',...)` or `df.select(*['col1', 'col2',...])` " + "instead" + ) + + indexer = self.columns._get_indexer_strict(list(args), "columns")[1] + return self.take(indexer, axis=1) + @overload def query( self, diff --git a/pandas/tests/frame/methods/test_select.py b/pandas/tests/frame/methods/test_select.py new file mode 100644 index 0000000000000..accf3ea336e18 --- /dev/null +++ b/pandas/tests/frame/methods/test_select.py @@ -0,0 +1,85 @@ +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +@pytest.fixture +def regular_df(): + return DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]}) + + +@pytest.fixture +def multiindex_df(): + return DataFrame( + [(0, 2, 4), (1, 3, 5)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("A", "d"), ("B", "e")]), + ) + + +class TestSelect: + def test_select_subset_cols(self, regular_df): + expected = DataFrame({"a": [1, 2], "c": [5, 6]}) + result = regular_df.select("a", "c") + tm.assert_frame_equal(result, expected) + + def test_single_value(self, regular_df): + expected = DataFrame({"a": [1, 2]}) + result = regular_df.select("a") + assert isinstance(result, DataFrame) + tm.assert_frame_equal(result, expected) + + def test_select_change_order(self, regular_df): + expected = DataFrame({"b": [3, 4], "d": [7, 8], "a": [1, 2], "c": [5, 6]}) + result = regular_df.select("b", "d", "a", "c") + tm.assert_frame_equal(result, expected) + 
+ def test_select_none(self, regular_df): + result = regular_df.select() + assert result.empty + + def test_select_duplicated(self, regular_df): + expected = ["a", "d", "a"] + result = regular_df.select("a", "d", "a") + assert result.columns.tolist() == expected + + def test_select_list(self, regular_df): + with pytest.raises(ValueError, match="does not support a list"): + regular_df.select(["a", "b"]) + + def test_select_missing(self, regular_df): + with pytest.raises(KeyError, match=r"None of .* are in the \[columns\]"): + regular_df.select("z") + + def test_select_not_hashable(self, regular_df): + with pytest.raises(TypeError, match="unhashable type"): + regular_df.select(set()) + + def test_select_multiindex_one_level(self, multiindex_df): + expected = DataFrame( + [(0, 2), (1, 3)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("A", "d")]), + ) + result = multiindex_df.select("A") + tm.assert_frame_equal(result, expected) + + def test_select_multiindex_single_column(self, multiindex_df): + expected = DataFrame( + [(2,), (3,)], columns=pd.MultiIndex.from_tuples([("A", "d")]) + ) + result = multiindex_df.select(("A", "d")) + assert isinstance(result, DataFrame) + tm.assert_frame_equal(result, expected) + + def test_select_multiindex_multiple_columns(self, multiindex_df): + expected = DataFrame( + [(0, 4), (1, 5)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")]), + ) + result = multiindex_df.select(("A", "c"), ("B", "e")) + tm.assert_frame_equal(result, expected) + + def test_select_multiindex_missing(self, multiindex_df): + with pytest.raises(KeyError, match="not in index"): + multiindex_df.select("Z") From bf2a9ea1c3057bf53b806e3fc369429edb5ea55a Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Thu, 12 Jun 2025 12:07:59 +0200 Subject: [PATCH 2/4] Making select work with a list parameter --- pandas/core/frame.py | 46 +++++++++++++---------- pandas/tests/frame/methods/test_select.py | 19 ++++++++-- 2 files changed, 42 insertions(+), 23 
deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a603ba474a40..09b85a0bb2037 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4489,9 +4489,11 @@ def select(self, *args): Parameters ---------- - *args : hashable or tuple of hashable - The names or the columns to return. In general this will be strings, + *args : hashable or a single list arg of hashable + The names of the columns to return. In general this will be strings, but pandas supports other types of column names, if they are hashable. + If only one argument of type list is provided, the elements of the + list will be considered the named of the columns to be returned Returns ------- @@ -4520,9 +4522,17 @@ def select(self, *args): 1 Alice 22 2 Bob 35 + A list can also be used to specify the names of the columns to return: + + >>> df.select(["last_name", "age"]) + last_name age + 0 Smith 61 + 1 Cooper 22 + 2 Marley 35 + Selecting with a pattern can be done with Python expressions: - >>> df.select(*[col for col in df.columns if col.endswith("_name")]) + >>> df.select([col for col in df.columns if col.endswith("_name")]) first_name last_name 0 John Smith 1 Alice Cooper @@ -4536,15 +4546,6 @@ def select(self, *args): 1 Cooper Alice 22 2 Marley Bob 35 - In case the columns are in a list, Python unpacking with star can be used: - - >>> columns = ["last_name", "age"] - >>> df.select(*columns) - last_name age - 0 Smith 61 - 1 Cooper 22 - 2 Marley 35 - Note that a DataFrame is always returned. If a single column is requested, a DataFrame with a single column is returned, not a Series: @@ -4563,8 +4564,8 @@ def select(self, *args): ... ), ... 
) - If just column names are provided, they will select from the first level of the - ``MultiIndex``: + If column names are provided, they will select from the first level of + the ``MultiIndex``: >>> df.select("names") names @@ -4573,7 +4574,7 @@ def select(self, *args): 1 Alice Cooper 2 Bob Marley - To select from multiple or all levels, tuples can be provided: + To select from multiple or all levels, tuples can be used: >>> df.select(("names", "last_name"), ("other", "age")) names other @@ -4583,11 +4584,16 @@ def select(self, *args): 2 Marley 35 """ if args and isinstance(args[0], list): - raise ValueError( - "`DataFrame.select` does not support a list. Please use " - "`df.select('col1', 'col2',...)` or `df.select(*['col1', 'col2',...])` " - "instead" - ) + if len(args) == 1: + args = args[0] + else: + raise ValueError( + "`DataFrame.select` supports individual columns " + "`df.select('col1', 'col2',...)` or a list " + "`df.select(['col1', 'col2',...])`, but not both. " + "You can unpack the list if you have a mix: " + "`df.select(*['col1', 'col2'], 'col3')`." 
+ ) indexer = self.columns._get_indexer_strict(list(args), "columns")[1] return self.take(indexer, axis=1) diff --git a/pandas/tests/frame/methods/test_select.py b/pandas/tests/frame/methods/test_select.py index accf3ea336e18..6aab179e3644b 100644 --- a/pandas/tests/frame/methods/test_select.py +++ b/pandas/tests/frame/methods/test_select.py @@ -44,9 +44,14 @@ def test_select_duplicated(self, regular_df): result = regular_df.select("a", "d", "a") assert result.columns.tolist() == expected - def test_select_list(self, regular_df): - with pytest.raises(ValueError, match="does not support a list"): - regular_df.select(["a", "b"]) + def test_select_single_list(self, regular_df): + expected = DataFrame({"a": [1, 2], "c": [5, 6]}) + result = regular_df.select(["a", "c"]) + tm.assert_frame_equal(result, expected) + + def test_select_list_and_string(self, regular_df): + with pytest.raises(ValueError, match="supports individual columns"): + regular_df.select(["a", "c"], "b") def test_select_missing(self, regular_df): with pytest.raises(KeyError, match=r"None of .* are in the \[columns\]"): @@ -80,6 +85,14 @@ def test_select_multiindex_multiple_columns(self, multiindex_df): result = multiindex_df.select(("A", "c"), ("B", "e")) tm.assert_frame_equal(result, expected) + def test_select_multiindex_multiple_columns_as_list(self, multiindex_df): + expected = DataFrame( + [(0, 4), (1, 5)], + columns=pd.MultiIndex.from_tuples([("A", "c"), ("B", "e")]), + ) + result = multiindex_df.select([("A", "c"), ("B", "e")]) + tm.assert_frame_equal(result, expected) + def test_select_multiindex_missing(self, multiindex_df): with pytest.raises(KeyError, match="not in index"): multiindex_df.select("Z") From 92cb1e74e4b603b1021c88a27ee4b618aa2225e8 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 13 Jun 2025 11:42:29 +0200 Subject: [PATCH 3/4] Improve docs --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py 
b/pandas/core/frame.py index 09b85a0bb2037..e13eeb42b8877 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4484,7 +4484,7 @@ def select(self, *args): Select a subset of columns from the DataFrame. Select can be used to return a DataFrame with some specific columns. - This can be used to remove unwanted columns, as well as to return a + This can be used to select a subset of the columns, as well as to return a DataFrame with the columns sorted in a specific order. Parameters @@ -4493,7 +4493,7 @@ def select(self, *args): The names of the columns to return. In general this will be strings, but pandas supports other types of column names, if they are hashable. If only one argument of type list is provided, the elements of the - list will be considered the named of the columns to be returned + list will be considered the names of the columns to be returned. Returns ------- From 527d1d7bd7757049c3db41828827e12473953799 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 13 Jun 2025 13:27:40 +0200 Subject: [PATCH 4/4] Typing --- pandas/core/frame.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e13eeb42b8877..627f549681a99 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4585,7 +4585,7 @@ def select(self, *args): """ if args and isinstance(args[0], list): if len(args) == 1: - args = args[0] + columns = args[0] else: raise ValueError( "`DataFrame.select` supports individual columns " @@ -4594,8 +4594,10 @@ def select(self, *args): "You can unpack the list if you have a mix: " "`df.select(*['col1', 'col2'], 'col3')`." ) + else: + columns = list(args) - indexer = self.columns._get_indexer_strict(list(args), "columns")[1] + indexer = self.columns._get_indexer_strict(columns, "columns")[1] return self.take(indexer, axis=1) @overload