Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix read_csv MultiIndex empty value handling (#59560) #60487

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,7 @@ MultiIndex
- :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
- :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
- :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
- Bug in :func:`read_csv` where empty values in :class:`MultiIndex` columns and indexes were handled inconsistently; they are now replaced with empty strings instead of ``"Unnamed: ..."`` placeholders when uniqueness can be ensured (:issue:`59560`)
-

I/O
Expand Down
22 changes: 19 additions & 3 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,19 @@ def extract(r):
return tuple(r[i] for i in range(field_count) if i not in sic)

columns = list(zip(*(extract(r) for r in header)))
# Replace None, empty strings, or column names starting with 'Unnamed: '
# (used as placeholders in multi-index headers) with empty strings.
columns = [
tuple(
""
if level is None
or str(level).strip() == ""
or (isinstance(level, str) and level.startswith("Unnamed: "))
else level
for level in col
)
for col in columns
]
names = columns.copy()
for single_ic in sorted(ic):
names.insert(single_ic, single_ic)
Expand Down Expand Up @@ -357,7 +370,7 @@ def _agg_index(self, index) -> Index:
)
else:
col_na_values, col_na_fvalues = set(), set()

col_na_values.discard("")
cast_type = None
index_converter = False
if self.index_names is not None:
Expand Down Expand Up @@ -694,8 +707,11 @@ def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, lis

# Only clean index names that were placeholders.
for i, name in enumerate(index_names):
if isinstance(name, str) and name in self.unnamed_cols:
index_names[i] = None
if isinstance(name, str):
if name.strip() == "":
index_names[i] = ""
elif name in self.unnamed_cols:
index_names[i] = None

return index_names, columns, index_col

Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,3 +375,24 @@ def test_multiindex_columns_not_leading_index_col(all_parsers):
)
expected = DataFrame([["x", 1, 2]], columns=cols, index=["y"])
tm.assert_frame_equal(result, expected)


def test_multiindex_empty_values_handling(all_parsers):
    # GH#59560
    # Blank / whitespace-only cells in a two-row header and in the second
    # index column are expected to be read back as empty strings.
    if all_parsers.engine == "pyarrow":
        pytest.skip(
            "PyArrow engine does not support multiple header rows for MultiIndex cols."
        )

    # Two header rows followed by two data rows; the first two fields of
    # every row belong to the index (index_col=[0, 1]).
    csv_text = (
        ", ,a,b,b\n"
        ", ,, ,b2\n"
        "i1,,0,1,2\n"
        "i2,,3,4,5\n"
    )

    # Build the expectation up front: empty strings where the CSV had no value.
    row_index = MultiIndex.from_tuples([("i1", ""), ("i2", "")])
    col_index = MultiIndex.from_tuples(
        [("a", ""), ("b", ""), ("b", "b2")], names=[None, None]
    )
    expected = DataFrame(
        [[0, 1, 2], [3, 4, 5]],
        index=row_index,
        columns=col_index,
    )

    result = all_parsers.read_csv(
        StringIO(csv_text), header=[0, 1], index_col=[0, 1]
    )
    tm.assert_frame_equal(result, expected)
Loading