해당 서비스 도커화 성공, 룰 추가, 로그인 오류 수정, 소문자 룰 어느정도 해결

This commit is contained in:
Hyungi Ahn
2025-08-01 15:55:27 +09:00
parent ef06cec8d6
commit 809b2af53e
6418 changed files with 1922672 additions and 69 deletions

View File

@@ -0,0 +1,252 @@
import shlex
import subprocess
import time
import uuid
import pytest
from pandas.compat import (
is_ci_environment,
is_platform_arm,
is_platform_mac,
is_platform_windows,
)
import pandas.util._test_decorators as td
import pandas.io.common as icom
from pandas.io.parsers import read_csv
@pytest.fixture
def compression_to_extension():
    """Return ``icom.extension_to_compression`` with keys and values swapped."""
    inverted = {}
    for extension, compression in icom.extension_to_compression.items():
        inverted[compression] = extension
    return inverted
@pytest.fixture
def tips_file(datapath):
    """Return the location of ``tips.csv`` as resolved by ``datapath``."""
    csv_path = datapath("io", "data", "csv", "tips.csv")
    return csv_path
@pytest.fixture
def jsonl_file(datapath):
    """Return the location of ``items.jsonl`` as resolved by ``datapath``."""
    jsonl_path = datapath("io", "parser", "data", "items.jsonl")
    return jsonl_path
@pytest.fixture
def salaries_table(datapath):
    """Read the tab-separated ``salaries.csv`` dataset with ``read_csv``."""
    salaries_path = datapath("io", "parser", "data", "salaries.csv")
    return read_csv(salaries_path, sep="\t")
@pytest.fixture
def feather_file(datapath):
    """Return the location of ``feather-0_3_1.feather`` via ``datapath``."""
    feather_path = datapath("io", "data", "feather", "feather-0_3_1.feather")
    return feather_path
@pytest.fixture
def xml_file(datapath):
    """Return the location of ``books.xml`` via ``datapath``."""
    xml_path = datapath("io", "data", "xml", "books.xml")
    return xml_path
@pytest.fixture
def s3so(worker_id):
    """
    Build storage options whose ``endpoint_url`` targets the S3 test server.

    In a CI environment the endpoint is ``http://localhost:5000/``; otherwise
    the port is ``555`` followed by a suffix derived from ``worker_id``.
    """
    if is_ci_environment():
        return {"client_kwargs": {"endpoint_url": "http://localhost:5000/"}}

    # "master" maps to suffix "5"; any other id has its leading "g"/"w"
    # characters stripped (e.g. "gw3" -> "3").
    if worker_id == "master":
        suffix = "5"
    else:
        suffix = worker_id.lstrip("gw")
    endpoint = f"http://127.0.0.1:555{suffix}/"
    return {"client_kwargs": {"endpoint_url": endpoint}}
@pytest.fixture(scope="function" if is_ci_environment() else "session")
def monkeysession():
    """
    Yield a ``pytest.MonkeyPatch`` usable from a non-function-scoped fixture.

    The scope is "function" in a CI environment and "session" otherwise.
    """
    with pytest.MonkeyPatch.context() as patcher:
        yield patcher
@pytest.fixture(scope="function" if is_ci_environment() else "session")
def s3_base(worker_id, monkeysession):
    """
    Fixture for mocking S3 interaction.

    In a CI environment this yields the URL of the moto service
    (``http://localhost:5000``), or skips on Windows, macOS and ARM.
    Otherwise it launches ``moto_server`` in a separate process on port
    ``555<worker suffix>``, waits up to about five seconds for it to accept
    connections, yields the endpoint URL and terminates the process on
    teardown.

    Parameters
    ----------
    worker_id : str
        Worker identifier; "master" maps to suffix "5", otherwise leading
        "g"/"w" characters are stripped to obtain the port suffix.
    monkeysession : pytest.MonkeyPatch
        Used to set dummy AWS credentials in the environment.

    Yields
    ------
    str
        URL of the S3 endpoint to use in tests.
    """
    pytest.importorskip("s3fs")
    pytest.importorskip("boto3")

    # temporary workaround as moto fails for botocore >= 1.11 otherwise,
    # see https://github.com/spulec/moto/issues/1924 & 1952
    monkeysession.setenv("AWS_ACCESS_KEY_ID", "foobar_key")
    monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret")
    if is_ci_environment():
        if is_platform_arm() or is_platform_mac() or is_platform_windows():
            # NOT RUN on Windows/macOS/ARM, only Ubuntu
            # - subprocess in CI can cause timeouts
            # - GitHub Actions do not support
            #   container services for the above OSs
            # - CircleCI will probably hit the Docker rate pull limit
            pytest.skip(
                "S3 tests do not have a corresponding service in "
                "Windows, macOS or ARM platforms"
            )
        else:
            yield "http://localhost:5000"
    else:
        requests = pytest.importorskip("requests")
        pytest.importorskip("moto", minversion="1.3.14")
        pytest.importorskip("flask")  # server mode needs flask too

        # Launching moto in server mode, i.e., as a separate process
        # with an S3 endpoint on localhost

        worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
        endpoint_port = f"555{worker_id}"
        endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"

        # pipe to null to avoid logging in terminal
        with subprocess.Popen(
            shlex.split(f"moto_server s3 -p {endpoint_port}"),
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        ) as proc:
            try:
                timeout = 5
                while timeout > 0:
                    try:
                        # OK to go once server is accepting connections
                        r = requests.get(endpoint_uri)
                        if r.ok:
                            break
                    except Exception:
                        # Best effort: the server may simply not be up yet.
                        pass
                    timeout -= 0.1
                    time.sleep(0.1)
                yield endpoint_uri
            finally:
                # Always stop the server: leaving the ``with`` block waits for
                # the child process, which would hang if it were still running.
                proc.terminate()
@pytest.fixture
def s3_resource(s3_base):
    """Return a boto3 "s3" resource bound to the ``s3_base`` endpoint."""
    import boto3

    return boto3.resource("s3", endpoint_url=s3_base)
@pytest.fixture
def s3_public_bucket(s3_resource):
    """
    Create a uniquely named ``pandas-test-<uuid>`` bucket and yield it.

    On teardown the bucket's objects are deleted, then the bucket itself.
    """
    bucket_name = f"pandas-test-{uuid.uuid4()}"
    public_bucket = s3_resource.Bucket(bucket_name)
    public_bucket.create()
    yield public_bucket
    public_bucket.objects.delete()
    public_bucket.delete()
@pytest.fixture
def s3_public_bucket_with_data(
    s3_public_bucket, tips_file, jsonl_file, feather_file, xml_file
):
    """
    Upload the test datasets to the public bucket and return the bucket.

    The following keys are written:
    - tips#1.csv
    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl
    - simple_dataset.feather
    - books.xml
    """
    # Maps the S3 key to the local file uploaded under that key.
    uploads = {
        "tips#1.csv": tips_file,
        "tips.csv": tips_file,
        "tips.csv.gz": tips_file + ".gz",
        "tips.csv.bz2": tips_file + ".bz2",
        "items.jsonl": jsonl_file,
        "simple_dataset.feather": feather_file,
        "books.xml": xml_file,
    }
    for key, local_path in uploads.items():
        with open(local_path, "rb") as handle:
            s3_public_bucket.put_object(Key=key, Body=handle)
    return s3_public_bucket
@pytest.fixture
def s3_private_bucket(s3_resource):
    """
    Create a uniquely named ``cant_get_it-<uuid>`` bucket with a private ACL.

    On teardown the bucket's objects are deleted, then the bucket itself.
    """
    bucket_name = f"cant_get_it-{uuid.uuid4()}"
    private_bucket = s3_resource.Bucket(bucket_name)
    private_bucket.create(ACL="private")
    yield private_bucket
    private_bucket.objects.delete()
    private_bucket.delete()
@pytest.fixture
def s3_private_bucket_with_data(
    s3_private_bucket, tips_file, jsonl_file, feather_file, xml_file
):
    """
    Upload the test datasets to the private bucket and return the bucket.

    The following keys are written:
    - tips#1.csv
    - tips.csv
    - tips.csv.gz
    - tips.csv.bz2
    - items.jsonl
    - simple_dataset.feather
    - books.xml
    """
    # Maps the S3 key to the local file uploaded under that key.
    uploads = {
        "tips#1.csv": tips_file,
        "tips.csv": tips_file,
        "tips.csv.gz": tips_file + ".gz",
        "tips.csv.bz2": tips_file + ".bz2",
        "items.jsonl": jsonl_file,
        "simple_dataset.feather": feather_file,
        "books.xml": xml_file,
    }
    for key, local_path in uploads.items():
        with open(local_path, "rb") as handle:
            s3_private_bucket.put_object(Key=key, Body=handle)
    return s3_private_bucket
# (file extension, compression name) pairs used as fixture params.
# Each extension appears in lower and upper case; the two zstd entries are
# wrapped in ``pytest.param`` carrying a ``td.skip_if_no("zstandard")`` mark.
_compression_formats_params = [
    (".no_compress", None),
    ("", None),
    (".gz", "gzip"),
    (".GZ", "gzip"),
    (".bz2", "bz2"),
    (".BZ2", "bz2"),
    (".zip", "zip"),
    (".ZIP", "zip"),
    (".xz", "xz"),
    (".XZ", "xz"),
    pytest.param((".zst", "zstd"), marks=td.skip_if_no("zstandard")),
    pytest.param((".ZST", "zstd"), marks=td.skip_if_no("zstandard")),
]
@pytest.fixture(params=_compression_formats_params[1:])
def compression_format(request):
    """Return each entry of ``_compression_formats_params`` except the first."""
    entry = request.param
    return entry
@pytest.fixture(params=_compression_formats_params)
def compression_ext(request):
    """Return the first element of each ``_compression_formats_params`` entry."""
    entry = request.param
    extension = entry[0]
    return extension
@pytest.fixture(
    params=[
        "python",
        pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")),
    ]
)
def string_storage(request):
    """
    Parametrized fixture for pd.options.mode.string_storage.

    Values:
    * 'python'
    * 'pyarrow' (carries a ``td.skip_if_no("pyarrow")`` mark)
    """
    storage = request.param
    return storage

View File

@@ -0,0 +1,41 @@
import pytest
import pandas._testing as tm
from pandas.io.parsers import read_csv
@pytest.fixture
def frame(float_frame):
    """
    Return the ``[:10]`` slice of the ``float_frame`` fixture.
    """
    first_ten = float_frame[:10]
    return first_ten
@pytest.fixture
def tsframe():
    """Return the ``[:5]`` slice of ``tm.makeTimeDataFrame()``."""
    time_frame = tm.makeTimeDataFrame()
    return time_frame[:5]
@pytest.fixture(params=[True, False])
def merge_cells(request):
    """Parametrized fixture yielding ``True`` and then ``False``."""
    flag = request.param
    return flag
@pytest.fixture
def df_ref(datapath):
    """
    Read ``test1.csv`` with the Python engine to serve as reference data.

    The first column is used as the index and dates are parsed.
    """
    csv_path = datapath("io", "data", "csv", "test1.csv")
    return read_csv(csv_path, index_col=0, parse_dates=True, engine="python")
@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"])
def read_ext(request):
    """
    Parametrized fixture of valid extensions for reading Excel files.
    """
    extension = request.param
    return extension

View File

@@ -0,0 +1,50 @@
import functools
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
pytest.importorskip("odf")
@pytest.fixture(autouse=True)
def cd_and_set_engine(monkeypatch, datapath):
    """
    Patch ``pd.read_excel`` to default to the "odf" engine and change the
    working directory to the Excel test-data folder for every test.
    """
    odf_reader = functools.partial(pd.read_excel, engine="odf")
    monkeypatch.setattr(pd, "read_excel", odf_reader)
    excel_dir = datapath("io", "data", "excel")
    monkeypatch.chdir(excel_dir)
def test_read_invalid_types_raises():
    """Reading a cell with an unknown value type raises ``ValueError``."""
    # invalid_value_type.ods was produced by manually editing the
    # content.xml file it contains
    msg = "Unrecognized type awesome_new_type"
    with pytest.raises(ValueError, match=msg):
        pd.read_excel("invalid_value_type.ods")
def test_read_writer_table():
    """A table inside an OpenDocument text file (.odt) can be read as well."""
    result = pd.read_excel("writertable.odt", sheet_name="Table1", index_col=0)

    expected = pd.DataFrame(
        [[1, np.nan, 7], [2, np.nan, 8], [3, np.nan, 9]],
        index=pd.Index(["Row 1", "Row 2", "Row 3"], name="Header"),
        columns=["Column 1", "Unnamed: 2", "Column 3"],
    )
    tm.assert_frame_equal(result, expected)
def test_read_newlines_between_xml_elements_table():
    """``test_newlines.ods`` is read into the expected three-column frame."""
    # GH#45598
    result = pd.read_excel("test_newlines.ods")

    rows = [[1.0, 4.0, 7], [np.nan, np.nan, 8], [3.0, 6.0, 9]]
    expected = pd.DataFrame(rows, columns=["Column 1", "Column 2", "Column 3"])
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,49 @@
import re
import pytest
import pandas._testing as tm
from pandas.io.excel import ExcelWriter
odf = pytest.importorskip("odf")
pytestmark = pytest.mark.parametrize("ext", [".ods"])
def test_write_append_mode_raises(ext):
    """Opening an odf ``ExcelWriter`` with ``mode="a"`` raises ``ValueError``."""
    expected_msg = "Append mode is not supported with odf!"

    with tm.ensure_clean(ext) as path, pytest.raises(ValueError, match=expected_msg):
        ExcelWriter(path, engine="odf", mode="a")
@pytest.mark.parametrize("engine_kwargs", [None, {"kwarg": 1}])
def test_engine_kwargs(ext, engine_kwargs):
    """Any engine kwarg is rejected because OpenDocumentSpreadsheet takes none."""
    # GH 42286
    # GH 43445
    with tm.ensure_clean(ext) as path:
        if engine_kwargs is None:
            # Without kwargs the writer opens and closes cleanly.
            with ExcelWriter(path, engine="odf", engine_kwargs=engine_kwargs) as _:
                pass
        else:
            error = re.escape(
                "OpenDocumentSpreadsheet() got an unexpected keyword argument 'kwarg'"
            )
            with pytest.raises(TypeError, match=error):
                ExcelWriter(path, engine="odf", engine_kwargs=engine_kwargs)
def test_book_and_sheets_consistent(ext):
    """``writer.sheets`` reflects tables added directly to ``writer.book``."""
    # GH#45687 - Ensure sheets is updated if user modifies book
    with tm.ensure_clean(ext) as path:
        with ExcelWriter(path) as writer:
            assert writer.sheets == {}

            new_table = odf.table.Table(name="test_name")
            writer.book.spreadsheet.addElement(new_table)

            assert writer.sheets == {"test_name": new_table}

View File

@@ -0,0 +1,398 @@
import contextlib
from pathlib import Path
import re
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.excel import (
ExcelWriter,
_OpenpyxlWriter,
)
openpyxl = pytest.importorskip("openpyxl")
pytestmark = pytest.mark.parametrize("ext", [".xlsx"])
def test_to_excel_styleconverter(ext):
    """``_convert_to_style_kwargs`` turns a style dict into openpyxl objects."""
    from openpyxl import styles

    hstyle = {
        "font": {"color": "00FF0000", "bold": True},
        "borders": {"top": "thin", "right": "thin", "bottom": "thin", "left": "thin"},
        "alignment": {"horizontal": "center", "vertical": "top"},
        "fill": {"patternType": "solid", "fgColor": {"rgb": "006666FF", "tint": 0.3}},
        "number_format": {"format_code": "0.00"},
        "protection": {"locked": True, "hidden": False},
    }

    thin_side = styles.Side(style=styles.borders.BORDER_THIN)
    # Expected converted value for each keyword, in the order they are checked.
    expected = {
        "font": styles.Font(bold=True, color=styles.Color("00FF0000")),
        "border": styles.Border(
            top=thin_side, right=thin_side, bottom=thin_side, left=thin_side
        ),
        "alignment": styles.Alignment(horizontal="center", vertical="top"),
        "fill": styles.PatternFill(
            patternType="solid", fgColor=styles.Color(rgb="006666FF", tint=0.3)
        ),
        "number_format": "0.00",
        "protection": styles.Protection(locked=True, hidden=False),
    }

    kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle)
    for key, want in expected.items():
        assert kw[key] == want
def test_write_cells_merge_styled(ext):
    """Cells covered by a later merged write take on the merged cell's font."""
    from pandas.io.formats.excel import ExcelCell

    sheet_name = "merge_styled"

    merged_style = {"font": {"color": "000000FF", "bold": True}}
    expected_font = _OpenpyxlWriter._convert_to_style_kwargs(merged_style)["font"]

    initial_cells = [
        ExcelCell(col=1, row=0, val=42, style={"font": {"color": "00FF0000"}}),
        ExcelCell(col=0, row=1, val=99, style={"font": {"color": "0000FF00"}}),
    ]
    merge_cells = [
        ExcelCell(
            col=0, row=0, val="pandas", mergestart=1, mergeend=1, style=merged_style
        )
    ]

    with tm.ensure_clean(ext) as path:
        with _OpenpyxlWriter(path) as writer:
            writer._write_cells(initial_cells, sheet_name=sheet_name)
            writer._write_cells(merge_cells, sheet_name=sheet_name)

            worksheet = writer.sheets[sheet_name]
        assert worksheet["B1"].font == expected_font
        assert worksheet["A2"].font == expected_font
@pytest.mark.parametrize("iso_dates", [True, False])
def test_engine_kwargs_write(ext, iso_dates):
    """``engine_kwargs`` given to ``ExcelWriter`` reach the openpyxl workbook."""
    # GH 42286 GH 43445
    with tm.ensure_clean(ext) as path:
        with ExcelWriter(
            path, engine="openpyxl", engine_kwargs={"iso_dates": iso_dates}
        ) as writer:
            assert writer.book.iso_dates == iso_dates
            # Something must be written or ExcelWriter refuses to close
            DataFrame().to_excel(writer)
def test_engine_kwargs_append_invalid(ext):
    """An unknown engine kwarg in append mode raises ``TypeError``."""
    # GH 43445
    expected_msg = re.escape(
        "load_workbook() got an unexpected keyword argument 'apple_banana'"
    )
    bad_kwargs = {"apple_banana": "fruit"}

    with tm.ensure_clean(ext) as path:
        DataFrame(["hello", "world"]).to_excel(path)
        with pytest.raises(TypeError, match=expected_msg):
            with ExcelWriter(
                path, engine="openpyxl", mode="a", engine_kwargs=bad_kwargs
            ) as writer:
                # Something must be written for ExcelWriter to close properly
                DataFrame(["good"]).to_excel(writer, sheet_name="Sheet2")
@pytest.mark.parametrize("data_only, expected", [(True, 0), (False, "=1+1")])
def test_engine_kwargs_append_data_only(ext, data_only, expected):
    """The ``data_only`` engine kwarg is honoured when appending."""
    # GH 43445
    # data_only is forwarded to openpyxl's load_workbook
    append_kwargs = {"data_only": data_only}
    with tm.ensure_clean(ext) as path:
        DataFrame(["=1+1"]).to_excel(path)
        with ExcelWriter(
            path, engine="openpyxl", mode="a", engine_kwargs=append_kwargs
        ) as writer:
            formula_cell = writer.sheets["Sheet1"]["B2"]
            assert formula_cell.value == expected
            # Something must be written for ExcelWriter to close properly
            DataFrame().to_excel(writer, sheet_name="Sheet2")
@pytest.mark.parametrize(
    "mode,expected", [("w", ["baz"]), ("a", ["foo", "bar", "baz"])]
)
def test_write_append_mode(ext, mode, expected):
    """Mode "w" replaces the workbook; mode "a" keeps the existing sheets."""
    df = DataFrame([1], columns=["baz"])

    with tm.ensure_clean(ext) as path:
        # Seed a workbook with sheets "foo" and "bar", each named in cell A1.
        seed = openpyxl.Workbook()
        seed.worksheets[0].title = "foo"
        seed.worksheets[0]["A1"].value = "foo"
        seed.create_sheet("bar")
        seed.worksheets[1]["A1"].value = "bar"
        seed.save(path)

        with ExcelWriter(path, engine="openpyxl", mode=mode) as writer:
            df.to_excel(writer, sheet_name="baz", index=False)

        with contextlib.closing(openpyxl.load_workbook(path)) as reloaded:
            titles = [sheet.title for sheet in reloaded.worksheets]
            assert titles == expected

            for position, title in enumerate(expected):
                assert reloaded.worksheets[position]["A1"].value == title
@pytest.mark.parametrize(
    "if_sheet_exists,num_sheets,expected",
    [
        ("new", 2, ["apple", "banana"]),
        ("replace", 1, ["pear"]),
        ("overlay", 1, ["pear", "banana"]),
    ],
)
def test_if_sheet_exists_append_modes(ext, if_sheet_exists, num_sheets, expected):
    """Each ``if_sheet_exists`` option gives the expected sheets and content."""
    # GH 40230
    original = DataFrame({"fruit": ["apple", "banana"]})
    appended = DataFrame({"fruit": ["pear"]})

    with tm.ensure_clean(ext) as path:
        original.to_excel(path, engine="openpyxl", sheet_name="foo", index=False)
        with ExcelWriter(
            path, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists
        ) as writer:
            appended.to_excel(writer, sheet_name="foo", index=False)

        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            assert len(wb.sheetnames) == num_sheets
            assert wb.sheetnames[0] == "foo"

            foo_sheet = pd.read_excel(wb, "foo", engine="openpyxl")
            assert list(foo_sheet["fruit"]) == expected

            if len(wb.sheetnames) == 2:
                # A second sheet was created; it must hold the appended frame.
                second = pd.read_excel(wb, wb.sheetnames[1], engine="openpyxl")
                tm.assert_frame_equal(second, appended)
@pytest.mark.parametrize(
    "startrow, startcol, greeting, goodbye",
    [
        (0, 0, ["poop", "world"], ["goodbye", "people"]),
        (0, 1, ["hello", "world"], ["poop", "people"]),
        (1, 0, ["hello", "poop"], ["goodbye", "people"]),
        (1, 1, ["hello", "world"], ["goodbye", "poop"]),
    ],
)
def test_append_overlay_startrow_startcol(ext, startrow, startcol, greeting, goodbye):
    """Overlay mode writes at the requested row and column offset."""
    base = DataFrame({"greeting": ["hello", "world"], "goodbye": ["goodbye", "people"]})
    patch = DataFrame(["poop"])

    with tm.ensure_clean(ext) as path:
        base.to_excel(path, engine="openpyxl", sheet_name="poo", index=False)
        with ExcelWriter(
            path, engine="openpyxl", mode="a", if_sheet_exists="overlay"
        ) as writer:
            # Offset the row by one because the sheet has a header row
            patch.to_excel(
                writer,
                index=False,
                header=False,
                startrow=startrow + 1,
                startcol=startcol,
                sheet_name="poo",
            )

        result = pd.read_excel(path, sheet_name="poo", engine="openpyxl")
        expected = DataFrame({"greeting": greeting, "goodbye": goodbye})
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "if_sheet_exists,msg",
    [
        (
            "invalid",
            "'invalid' is not valid for if_sheet_exists. Valid options "
            "are 'error', 'new', 'replace' and 'overlay'.",
        ),
        (
            "error",
            "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.",
        ),
        (
            None,
            "Sheet 'foo' already exists and if_sheet_exists is set to 'error'.",
        ),
    ],
)
def test_if_sheet_exists_raises(ext, if_sheet_exists, msg):
    """
    Appending to an existing sheet raises ``ValueError`` with message ``msg``.

    Covers an invalid ``if_sheet_exists`` value, the explicit "error" option,
    and ``None``, which is expected to produce the same message as "error".
    """
    # GH 40230
    df = DataFrame({"fruit": ["pear"]})
    with tm.ensure_clean(ext) as f:
        # Create the workbook outside ``pytest.raises`` so that only the
        # append step can satisfy the expected error.
        df.to_excel(f, sheet_name="foo", engine="openpyxl")
        with pytest.raises(ValueError, match=re.escape(msg)):
            with ExcelWriter(
                f, engine="openpyxl", mode="a", if_sheet_exists=if_sheet_exists
            ) as writer:
                df.to_excel(writer, sheet_name="foo")
def test_to_excel_with_openpyxl_engine(ext):
    """A styled frame can be written with the openpyxl engine without error."""
    # GH 29854

    def colour_by_sign(val):
        colour = "red" if val < 0 else "black"
        return f"color: {colour}"

    with tm.ensure_clean(ext) as filename:
        left = DataFrame({"A": np.linspace(1, 10, 10)})
        right = DataFrame({"B": np.linspace(1, 20, 10)})
        combined = pd.concat([left, right], axis=1)

        styled = combined.style.map(colour_by_sign).highlight_max()
        styled.to_excel(filename, engine="openpyxl")
@pytest.mark.parametrize("read_only", [True, False])
def test_read_workbook(datapath, ext, read_only):
    """Reading an open openpyxl workbook matches reading the file by path."""
    # GH 39528
    filename = datapath("io", "data", "excel", "test1" + ext)
    workbook = openpyxl.load_workbook(filename, read_only=read_only)
    with contextlib.closing(workbook) as wb:
        from_workbook = pd.read_excel(wb, engine="openpyxl")
    from_path = pd.read_excel(filename)
    tm.assert_frame_equal(from_workbook, from_path)
@pytest.mark.parametrize(
    "header, expected_data",
    [
        (
            0,
            {
                "Title": [np.nan, "A", 1, 2, 3],
                "Unnamed: 1": [np.nan, "B", 4, 5, 6],
                "Unnamed: 2": [np.nan, "C", 7, 8, 9],
            },
        ),
        (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}),
    ],
)
@pytest.mark.parametrize(
    "filename", ["dimension_missing", "dimension_small", "dimension_large"]
)
# read_only=None means: go through read_excel on the path, not a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_bad_dimension(
    datapath, ext, header, expected_data, filename, read_only
):
    """Files with missing or wrong dimension information are read correctly."""
    # GH 38956, 39001 - no/incorrect dimension information
    path = datapath("io", "data", "excel", f"{filename}{ext}")
    if read_only is not None:
        workbook = openpyxl.load_workbook(path, read_only=read_only)
        with contextlib.closing(workbook) as wb:
            result = pd.read_excel(wb, engine="openpyxl", header=header)
    else:
        result = pd.read_excel(path, header=header)
    tm.assert_frame_equal(result, DataFrame(expected_data))
def test_append_mode_file(ext):
    """Appending rewrites the zip archive instead of concatenating a new one."""
    # GH 39576
    empty = DataFrame()

    with tm.ensure_clean(ext) as path:
        empty.to_excel(path, engine="openpyxl")

        with ExcelWriter(
            path, mode="a", engine="openpyxl", if_sheet_exists="new"
        ) as writer:
            empty.to_excel(writer)

        # Concatenated zip files would repeat the member name, so
        # "docProps/app.xml" must occur exactly twice in the raw bytes
        raw = Path(path).read_bytes()
        assert raw.count(b"docProps/app.xml") == 2
# read_only=None means: go through read_excel on the path, not a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_empty_trailing_rows(datapath, ext, read_only):
    """``empty_trailing_rows`` reads into the expected five-row frame."""
    # GH 39181
    path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
    if read_only is not None:
        workbook = openpyxl.load_workbook(path, read_only=read_only)
        with contextlib.closing(workbook) as wb:
            result = pd.read_excel(wb, engine="openpyxl")
    else:
        result = pd.read_excel(path)

    expected_data = {
        "Title": [np.nan, "A", 1, 2, 3],
        "Unnamed: 1": [np.nan, "B", 4, 5, 6],
        "Unnamed: 2": [np.nan, "C", 7, 8, 9],
    }
    tm.assert_frame_equal(result, DataFrame(expected_data))
# read_only=None means: go through read_excel on the path, not a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_empty_with_blank_row(datapath, ext, read_only):
    """An otherwise empty file with a data-less row reads as an empty frame."""
    # GH 39547 - empty excel file with a row that has no data
    path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}")
    if read_only is not None:
        workbook = openpyxl.load_workbook(path, read_only=read_only)
        with contextlib.closing(workbook) as wb:
            result = pd.read_excel(wb, engine="openpyxl")
    else:
        result = pd.read_excel(path)
    tm.assert_frame_equal(result, DataFrame())
def test_book_and_sheets_consistent(ext):
    """``writer.sheets`` reflects sheets created directly on ``writer.book``."""
    # GH#45687 - Ensure sheets is updated if user modifies book
    with tm.ensure_clean(ext) as path:
        with ExcelWriter(path, engine="openpyxl") as writer:
            assert writer.sheets == {}

            created = writer.book.create_sheet("test_name", 0)

            assert writer.sheets == {"test_name": created}
def test_ints_spelled_with_decimals(datapath, ext):
    """Values that openpyxl returns as floats are compared against integers."""
    # GH 46988 - openpyxl returns this sheet with floats
    workbook_path = datapath(
        "io", "data", "excel", f"ints_spelled_with_decimals{ext}"
    )
    result = pd.read_excel(workbook_path)
    tm.assert_frame_equal(result, DataFrame(range(2, 12), columns=[1]))
def test_read_multiindex_header_no_index_names(datapath, ext):
    """A three-level header and three-level index are read as MultiIndexes."""
    # GH#47487
    path = datapath("io", "data", "excel", f"multiindex_no_index_names{ext}")
    result = pd.read_excel(path, index_col=[0, 1, 2], header=[0, 1, 2])

    expected_columns = pd.MultiIndex.from_tuples(
        [("X", "Y", "A1"), ("X", "Y", "A2"), ("XX", "YY", "B1"), ("XX", "YY", "B2")]
    )
    expected_index = pd.MultiIndex.from_tuples(
        [("A", "AA", "AAA"), ("A", "BB", "BBB")]
    )
    expected = DataFrame(
        [[np.nan, "x", "x", "x"], ["x", np.nan, np.nan, np.nan]],
        columns=expected_columns,
        index=expected_index,
    )
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,294 @@
import contextlib
import time
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
read_excel,
)
import pandas._testing as tm
from pandas.io.excel import ExcelWriter
from pandas.io.formats.excel import ExcelFormatter
pytest.importorskip("jinja2")
# jinja2 is currently required for Styler.__init__(). Technically Styler.to_excel
# could compute styles and render to excel without jinja2, since there is no
# 'template' file, but this needs the import error to delayed until render time.
def assert_equal_cell_styles(cell1, cell2):
    """
    Assert that two cells carry the same style.

    ``alignment``, ``border``, ``fill``, ``font`` and ``protection`` are
    compared through their ``__dict__``; ``number_format`` is compared
    directly.
    """
    # TODO: should find a better way to check equality
    for part in ("alignment", "border", "fill", "font"):
        assert getattr(cell1, part).__dict__ == getattr(cell2, part).__dict__
    assert cell1.number_format == cell2.number_format
    assert cell1.protection.__dict__ == cell2.protection.__dict__
@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
def test_styler_to_excel_unstyled(engine):
    """With no styles, ``Styler.to_excel`` matches ``DataFrame.to_excel``."""
    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((2, 2)))

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            df.to_excel(writer, sheet_name="dataframe")
            df.style.to_excel(writer, sheet_name="unstyled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            plain_columns = wb["dataframe"].columns
            styler_columns = wb["unstyled"].columns
            for plain_col, styler_col in zip(plain_columns, styler_columns):
                assert len(plain_col) == len(styler_col)
                for plain_cell, styler_cell in zip(plain_col, styler_col):
                    assert plain_cell.value == styler_cell.value
                    assert_equal_cell_styles(plain_cell, styler_cell)
# (css, attrs, expected) triples used to parametrize the Styler-to-Excel tests.
# ``css`` is the declaration applied through the Styler, ``attrs`` is the chain
# of attribute names followed from the loaded cell, and ``expected`` is the
# value found at the end of that chain: either one value for every engine or a
# dict keyed by writer engine name.
shared_style_params = [
    (
        "background-color: #111222",
        ["fill", "fgColor", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    (
        "color: #111222",
        ["font", "color", "value"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("font-family: Arial;", ["font", "name"], "arial"),
    ("font-weight: bold;", ["font", "b"], True),
    ("font-style: italic;", ["font", "i"], True),
    ("text-decoration: underline;", ["font", "u"], "single"),
    ("number-format: $??,???.00;", ["number_format"], "$??,???.00"),
    ("text-align: left;", ["alignment", "horizontal"], "left"),
    (
        "vertical-align: bottom;",
        ["alignment", "vertical"],
        {"xlsxwriter": None, "openpyxl": "bottom"},  # xlsxwriter Fails
    ),
    ("vertical-align: middle;", ["alignment", "vertical"], "center"),
    # Border widths
    ("border-left: 2pt solid red", ["border", "left", "style"], "medium"),
    ("border-left: 1pt dotted red", ["border", "left", "style"], "dotted"),
    ("border-left: 2pt dotted red", ["border", "left", "style"], "mediumDashDotDot"),
    ("border-left: 1pt dashed red", ["border", "left", "style"], "dashed"),
    ("border-left: 2pt dashed red", ["border", "left", "style"], "mediumDashed"),
    ("border-left: 1pt solid red", ["border", "left", "style"], "thin"),
    ("border-left: 3pt solid red", ["border", "left", "style"], "thick"),
    # Border expansion
    (
        "border-left: 2pt solid #111222",
        ["border", "left", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "top", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "top", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "right", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "right", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "bottom", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "bottom", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    ("border: 1pt solid red", ["border", "left", "style"], "thin"),
    (
        "border: 1pt solid #111222",
        ["border", "left", "color", "rgb"],
        {"xlsxwriter": "FF111222", "openpyxl": "00111222"},
    ),
    # Border styles
    (
        "border-left-style: hair; border-left-color: black",
        ["border", "left", "style"],
        "hair",
    ),
]
@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("css, attrs, expected", shared_style_params)
def test_styler_to_excel_basic(engine, css, attrs, expected):
    """A CSS declaration applied to data cells reaches the written cell."""
    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
    styler = df.style.map(lambda x: css)

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            df.to_excel(writer, sheet_name="dataframe")
            styler.to_excel(writer, sheet_name="styled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            # The unstyled data cell must lack the style; the styled one has it
            plain = wb["dataframe"].cell(2, 2)
            styled = wb["styled"].cell(2, 2)
        # Walk the attribute chain; the unstyled side may stop at None.
        for attr in attrs:
            plain, styled = getattr(plain, attr, None), getattr(styled, attr)

        # ``expected`` is either one value or a per-engine mapping.
        want = expected[engine] if isinstance(expected, dict) else expected
        assert plain is None or plain != want
        assert styled == want
@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("css, attrs, expected", shared_style_params)
def test_styler_to_excel_basic_indexes(engine, css, attrs, expected):
    """A CSS declaration applied to index and column labels reaches the file."""
    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))

    styler = df.style
    for axis in (0, 1):
        styler.map_index(lambda x: css, axis=axis)

    null_styler = df.style
    null_styler.map(lambda x: "null: css;")
    for axis in (0, 1):
        null_styler.map_index(lambda x: "null: css;", axis=axis)

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            null_styler.to_excel(writer, sheet_name="null_styled")
            styler.to_excel(writer, sheet_name="styled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            # Null-styled label cells must lack the style; styled ones have it
            null_index = wb["null_styled"].cell(2, 1)
            styled_index = wb["styled"].cell(2, 1)
            null_column = wb["null_styled"].cell(1, 2)
            styled_column = wb["styled"].cell(1, 2)
        # Walk the attribute chain; the null-styled side may stop at None.
        for attr in attrs:
            null_index = getattr(null_index, attr, None)
            styled_index = getattr(styled_index, attr)
            null_column = getattr(null_column, attr, None)
            styled_column = getattr(styled_column, attr)

        # ``expected`` is either one value or a per-engine mapping.
        want = expected[engine] if isinstance(expected, dict) else expected
        assert null_index is None or null_index != want
        assert styled_index == want
        assert null_column is None or null_column != want
        assert styled_column == want
# Border style names used to parametrize test_styler_to_excel_border_style.
# From https://openpyxl.readthedocs.io/en/stable/api/openpyxl.styles.borders.html
# Note: Leaving behavior of "width"-type styles undefined; user should use border-width
# instead, which is why "thin", "medium" and "thick" are commented out below.
excel_border_styles = [
    # "thin",
    "dashed",
    "mediumDashDot",
    "dashDotDot",
    "hair",
    "dotted",
    "mediumDashDotDot",
    # "medium",
    "double",
    "dashDot",
    "slantDashDot",
    # "thick",
    "mediumDashed",
]
@pytest.mark.parametrize(
    "engine",
    ["xlsxwriter", "openpyxl"],
)
@pytest.mark.parametrize("border_style", excel_border_styles)
def test_styler_to_excel_border_style(engine, border_style):
    """
    A named border style in ``border-left`` CSS is written as that style.

    The frame is written once unstyled and once styled; the left border style
    of the styled data cell must equal ``border_style`` while the unstyled
    cell must not.
    """
    css = f"border-left: {border_style} black thin"
    attrs = ["border", "left", "style"]

    pytest.importorskip(engine)
    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
    styler = df.style.map(lambda x: css)

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine=engine) as writer:
            df.to_excel(writer, sheet_name="dataframe")
            styler.to_excel(writer, sheet_name="styled")

        openpyxl = pytest.importorskip("openpyxl")  # test loading only with openpyxl
        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            # test unstyled data cell does not have expected styles
            # test styled cell has expected styles
            u_cell, s_cell = wb["dataframe"].cell(2, 2), wb["styled"].cell(2, 2)
        # Walk the attribute chain; the unstyled side may stop at None.
        for attr in attrs:
            u_cell, s_cell = getattr(u_cell, attr, None), getattr(s_cell, attr)

        # The expected value is always the plain ``border_style`` string, so
        # no per-engine lookup is needed here.
        assert u_cell is None or u_cell != border_style
        assert s_cell == border_style
def test_styler_custom_converter():
    """A custom ``style_converter`` overrides the colour from the Styler CSS."""
    openpyxl = pytest.importorskip("openpyxl")

    def custom_converter(css):
        # Ignores the incoming CSS and always returns the same font colour.
        return {"font": {"color": {"rgb": "111222"}}}

    df = DataFrame(np.random.default_rng(2).standard_normal((1, 1)))
    styler = df.style.map(lambda x: "color: #888999")

    with tm.ensure_clean(".xlsx") as path:
        with ExcelWriter(path, engine="openpyxl") as writer:
            formatter = ExcelFormatter(styler, style_converter=custom_converter)
            formatter.write(writer, sheet_name="custom")

        with contextlib.closing(openpyxl.load_workbook(path)) as wb:
            data_cell = wb["custom"].cell(2, 2)
            assert data_cell.font.color.value == "00111222"
@pytest.mark.single_cpu
@td.skip_if_not_us_locale
def test_styler_to_s3(s3_public_bucket, s3so):
    """A Styler written to the S3 test bucket reads back as the original frame."""
    # GH#46381
    mock_bucket_name = s3_public_bucket.name
    target_file = "test.xlsx"
    s3_url = f"s3://{mock_bucket_name}/{target_file}"

    df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
    styler = df.style.set_sticky(axis="index")
    styler.to_excel(s3_url, storage_options=s3so)

    # Poll the bucket until the key shows up, giving up after about 5 seconds.
    timeout = 5
    while not any(obj.key == target_file for obj in s3_public_bucket.objects.all()):
        time.sleep(0.1)
        timeout -= 0.1
        assert timeout > 0, "Timed out waiting for file to appear on moto"

    result = read_excel(s3_url, index_col=0, storage_options=s3so)
    tm.assert_frame_equal(result, df)

View File

@@ -0,0 +1,59 @@
import io
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.io.excel import ExcelFile
from pandas.io.excel._base import inspect_excel_format
xlrd = pytest.importorskip("xlrd")
@pytest.fixture(params=[".xls"])
def read_ext_xlrd(request):
    """
    Parametrized fixture of extensions readable with xlrd.

    Like read_ext, but without .ods and .xlsb, and for xlrd>2 also without
    .xlsx and .xlsm.
    """
    extension = request.param
    return extension
def test_read_xlrd_book(read_ext_xlrd, datapath):
    """An xlrd book reads the same via ``ExcelFile`` and via ``read_excel``."""
    read_kwargs = {"sheet_name": "Sheet1", "index_col": 0}

    workbook_path = datapath("io", "data", "excel", "test1.xls")
    with xlrd.open_workbook(workbook_path) as book:
        with ExcelFile(book, engine="xlrd") as xl:
            result = pd.read_excel(xl, **read_kwargs)

        expected = pd.read_excel(book, engine="xlrd", **read_kwargs)
    tm.assert_frame_equal(result, expected)
def test_read_xlsx_fails(datapath):
    """Reading an .xlsx file with the xlrd engine raises ``XLRDError``."""
    # GH 29375
    from xlrd.biffh import XLRDError

    xlsx_path = datapath("io", "data", "excel", "test1.xlsx")
    expected_msg = "Excel xlsx file; not supported"
    with pytest.raises(XLRDError, match=expected_msg):
        pd.read_excel(xlsx_path, engine="xlrd")
@pytest.mark.parametrize(
    "file_header",
    [
        b"\x09\x00\x04\x00\x07\x00\x10\x00",
        b"\x09\x02\x06\x00\x00\x00\x10\x00",
        b"\x09\x04\x06\x00\x00\x00\x10\x00",
        b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1",
    ],
)
def test_read_old_xls_files(file_header):
    """Each parametrized 8-byte header is classified as the "xls" format."""
    # GH 41226
    stream = io.BytesIO(file_header)
    detected = inspect_excel_format(stream)
    assert detected == "xls"

View File

@@ -0,0 +1,78 @@
import contextlib
import pytest
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.excel import ExcelWriter
xlsxwriter = pytest.importorskip("xlsxwriter")
pytestmark = pytest.mark.parametrize("ext", [".xlsx"])
def test_column_format(ext):
    """
    Check that a column number format set on the xlsxwriter worksheet
    is found on the cells when the file is read back with openpyxl.
    """
    # Test that column formats are applied to cells. Test for issue #9167.
    # Applicable to xlsxwriter only.
    openpyxl = pytest.importorskip("openpyxl")

    with tm.ensure_clean(ext) as path:
        frame = DataFrame({"A": [123456, 123456], "B": [123456, 123456]})

        with ExcelWriter(path) as writer:
            frame.to_excel(writer)

            # Add a number format to col B and ensure it is applied to cells.
            num_format = "#,##0"
            write_workbook = writer.book
            write_worksheet = write_workbook.worksheets()[0]
            col_format = write_workbook.add_format({"num_format": num_format})
            write_worksheet.set_column("B:B", None, col_format)

        # Re-open the written file with openpyxl to inspect what was stored.
        with contextlib.closing(openpyxl.load_workbook(path)) as read_workbook:
            try:
                read_worksheet = read_workbook["Sheet1"]
            except TypeError:
                # compat: fall back to the name-based sheet accessor
                read_worksheet = read_workbook.get_sheet_by_name(name="Sheet1")

        # Get the number format from the cell.
        try:
            cell = read_worksheet["B2"]
        except TypeError:
            # compat: fall back to the cell() accessor
            cell = read_worksheet.cell("B2")

        try:
            read_num_format = cell.number_format
        except AttributeError:
            # compat: look the format code up through the cell style instead
            read_num_format = cell.style.number_format._format_code

        assert read_num_format == num_format
def test_write_append_mode_raises(ext):
    """Opening an xlsxwriter ExcelWriter with mode="a" raises ValueError."""
    expected_error = "Append mode is not supported with xlsxwriter!"

    with tm.ensure_clean(ext) as path, pytest.raises(
        ValueError, match=expected_error
    ):
        ExcelWriter(path, engine="xlsxwriter", mode="a")
@pytest.mark.parametrize("nan_inf_to_errors", [True, False])
def test_engine_kwargs(ext, nan_inf_to_errors):
    """The "options" entry of engine_kwargs shows up on the workbook object."""
    # GH 42286
    workbook_options = {"nan_inf_to_errors": nan_inf_to_errors}

    with tm.ensure_clean(ext) as path:
        with ExcelWriter(
            path, engine="xlsxwriter", engine_kwargs={"options": workbook_options}
        ) as writer:
            assert writer.book.nan_inf_to_errors == nan_inf_to_errors
def test_book_and_sheets_consistent(ext):
    """A worksheet added through writer.book is reflected in writer.sheets."""
    # GH#45687 - Ensure sheets is updated if user modifies book
    with tm.ensure_clean(ext) as path:
        with ExcelWriter(path, engine="xlsxwriter") as writer:
            assert writer.sheets == {}
            added_sheet = writer.book.add_worksheet("test_name")
            assert writer.sheets == {"test_name": added_sheet}

View File

@@ -0,0 +1,307 @@
import numpy as np
import pytest
from pandas import DataFrame
pytest.importorskip("jinja2")
def bar_grad(a=None, b=None, c=None, d=None):
    """
    Build the expected CSS property list for a bar cell.

    The list always starts with the fixed width entry. When at least one of
    the four gradient stops is not None, a "background" entry is appended
    whose linear-gradient joins the truthy stops with commas.
    """
    stops = (a, b, c, d)
    css = [("width", "10em")]
    if any(stop is not None for stop in stops):
        joined = ",".join(stop for stop in stops if stop)
        css.append(("background", f"linear-gradient(90deg,{joined})"))
    return css
def no_bar():
    """Expected CSS for a cell without a bar: bar_grad with no gradient stops."""
    return bar_grad(a=None, b=None, c=None, d=None)
def bar_to(x, color="#d65f5f"):
    """Expected CSS for a bar in ``color`` that switches to transparent at x%."""
    stop = f"{x:.1f}%"
    return bar_grad(f" {color} {stop}", f" transparent {stop}")
def bar_from_to(x, y, color="#d65f5f"):
    """Expected CSS for a bar in ``color`` between x% and y%, transparent outside."""
    start, end = f"{x:.1f}%", f"{y:.1f}%"
    return bar_grad(
        f" transparent {start}",
        f" {color} {start}",
        f" {color} {end}",
        f" transparent {end}",
    )
@pytest.fixture
def df_pos():
    """Single-column frame holding the positive values 1, 2, 3."""
    return DataFrame([[value] for value in (1, 2, 3)])
@pytest.fixture
def df_neg():
    """Single-column frame holding the negative values -1, -2, -3."""
    return DataFrame([[value] for value in (-1, -2, -3)])
@pytest.fixture
def df_mix():
    """Single-column frame mixing signs: -3, 1, 2."""
    return DataFrame([[value] for value in (-3, 1, 2)])
@pytest.mark.parametrize(
    "align, exp",
    [
        ("left", [no_bar(), bar_to(50), bar_to(100)]),
        ("right", [bar_to(100), bar_from_to(50, 100), no_bar()]),
        ("mid", [bar_to(33.33), bar_to(66.66), bar_to(100)]),
        ("zero", [bar_from_to(50, 66.7), bar_from_to(50, 83.3), bar_from_to(50, 100)]),
        ("mean", [bar_to(50), no_bar(), bar_from_to(50, 100)]),
        (2.0, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
        (np.median, [bar_to(50), no_bar(), bar_from_to(50, 100)]),
    ],
)
def test_align_positive_cases(df_pos, align, exp):
    """Compare computed bar CSS per align option on all-positive data."""
    # test different align cases for all positive values
    ctx = df_pos.style.bar(align=align)._compute().ctx
    # every parametrized ``exp`` lists the three rows of column 0 in order
    assert ctx == {(row, 0): css for row, css in enumerate(exp)}
@pytest.mark.parametrize(
    "align, exp",
    [
        ("left", [bar_to(100), bar_to(50), no_bar()]),
        ("right", [no_bar(), bar_from_to(50, 100), bar_to(100)]),
        ("mid", [bar_from_to(66.66, 100), bar_from_to(33.33, 100), bar_to(100)]),
        ("zero", [bar_from_to(33.33, 50), bar_from_to(16.66, 50), bar_to(50)]),
        ("mean", [bar_from_to(50, 100), no_bar(), bar_to(50)]),
        (-2.0, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
        (np.median, [bar_from_to(50, 100), no_bar(), bar_to(50)]),
    ],
)
def test_align_negative_cases(df_neg, align, exp):
    """Compare computed bar CSS per align option on all-negative data."""
    # test different align cases for all negative values
    ctx = df_neg.style.bar(align=align)._compute().ctx
    # every parametrized ``exp`` lists the three rows of column 0 in order
    assert ctx == {(row, 0): css for row, css in enumerate(exp)}
@pytest.mark.parametrize(
    "align, exp",
    [
        ("left", [no_bar(), bar_to(80), bar_to(100)]),
        ("right", [bar_to(100), bar_from_to(80, 100), no_bar()]),
        ("mid", [bar_to(60), bar_from_to(60, 80), bar_from_to(60, 100)]),
        ("zero", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
        ("mean", [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
        (-0.0, [bar_to(50), bar_from_to(50, 66.66), bar_from_to(50, 83.33)]),
        (np.nanmedian, [bar_to(50), no_bar(), bar_from_to(50, 62.5)]),
    ],
)
@pytest.mark.parametrize("nans", [True, False])
def test_align_mixed_cases(df_mix, align, exp, nans):
    """Compare computed bar CSS per align option on mixed-sign data."""
    # test different align cases for mixed positive and negative values
    # also test no impact of NaNs and no_bar
    expected = {(row, 0): css for row, css in enumerate(exp)}
    if nans:
        # the extra all-NaN row is expected to get no bar
        df_mix.loc[3, :] = np.nan
        expected[(3, 0)] = no_bar()

    ctx = df_mix.style.bar(align=align)._compute().ctx
    assert ctx == expected
@pytest.mark.parametrize(
    "align, exp",
    [
        (
            "left",
            {
                "index": [[no_bar(), no_bar()], [bar_to(100), bar_to(100)]],
                "columns": [[no_bar(), bar_to(100)], [no_bar(), bar_to(100)]],
                "none": [[no_bar(), bar_to(33.33)], [bar_to(66.66), bar_to(100)]],
            },
        ),
        (
            "mid",
            {
                "index": [[bar_to(33.33), bar_to(50)], [bar_to(100), bar_to(100)]],
                "columns": [[bar_to(50), bar_to(100)], [bar_to(75), bar_to(100)]],
                "none": [[bar_to(25), bar_to(50)], [bar_to(75), bar_to(100)]],
            },
        ),
        (
            "zero",
            {
                "index": [
                    [bar_from_to(50, 66.66), bar_from_to(50, 75)],
                    [bar_from_to(50, 100), bar_from_to(50, 100)],
                ],
                "columns": [
                    [bar_from_to(50, 75), bar_from_to(50, 100)],
                    [bar_from_to(50, 87.5), bar_from_to(50, 100)],
                ],
                "none": [
                    [bar_from_to(50, 62.5), bar_from_to(50, 75)],
                    [bar_from_to(50, 87.5), bar_from_to(50, 100)],
                ],
            },
        ),
        (
            2,
            {
                "index": [
                    [bar_to(50), no_bar()],
                    [bar_from_to(50, 100), bar_from_to(50, 100)],
                ],
                "columns": [
                    [bar_to(50), no_bar()],
                    [bar_from_to(50, 75), bar_from_to(50, 100)],
                ],
                "none": [
                    [bar_from_to(25, 50), no_bar()],
                    [bar_from_to(50, 75), bar_from_to(50, 100)],
                ],
            },
        ),
    ],
)
@pytest.mark.parametrize("axis", ["index", "columns", "none"])
def test_align_axis(align, exp, axis):
    """Compare computed bar CSS for each align/axis pair on a 2x2 frame."""
    # test all axis combinations with positive values and different aligns
    frame = DataFrame([[1, 2], [3, 4]])
    # the "none" parameter id stands for axis=None
    bar_axis = None if axis == "none" else axis
    ctx = frame.style.bar(align=align, axis=bar_axis)._compute().ctx

    grid = exp[axis]
    expected = {(row, col): grid[row][col] for row in range(2) for col in range(2)}
    assert ctx == expected
@pytest.mark.parametrize(
    "values, vmin, vmax",
    [
        ("positive", 1.5, 2.5),
        ("negative", -2.5, -1.5),
        ("mixed", -2.5, 1.5),
    ],
)
@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"])  # test min/max separately
@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
def test_vmin_vmax_clipping(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
    """bar() with vmin/vmax must equal bar() on data clipped to those bounds."""
    # test that clipping occurs if any vmin > data_values or vmax < data_values
    if align == "mid":  # mid acts as left or right in each case
        align = {"positive": "left", "negative": "right"}.get(values, "mid")

    frames = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}
    df = frames[values]

    # optionally drop one of the two bounds so each is tested on its own
    if nullify == "vmin":
        vmin = None
    if nullify == "vmax":
        vmax = None

    # clip the data by hand; +/-999 stand in for a missing bound
    upper = vmax if vmax else 999
    lower = vmin if vmin else -999
    clip_df = df.where(df <= upper, other=vmax)
    clip_df = clip_df.where(clip_df >= lower, other=vmin)

    styled = df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
    result = styled._compute().ctx
    expected = clip_df.style.bar(align=align, color=["red", "green"])._compute().ctx
    assert result == expected
@pytest.mark.parametrize(
    "values, vmin, vmax",
    [
        ("positive", 0.5, 4.5),
        ("negative", -4.5, -0.5),
        ("mixed", -4.5, 4.5),
    ],
)
@pytest.mark.parametrize("nullify", [None, "vmin", "vmax"])  # test min/max separately
@pytest.mark.parametrize("align", ["left", "right", "zero", "mid"])
def test_vmin_vmax_widening(df_pos, df_neg, df_mix, values, vmin, vmax, nullify, align):
    """bar() with wider vmin/vmax must match bar() on data extended by them."""
    # test that widening occurs if any vmax > data_values or vmin < data_values
    if align == "mid":  # mid acts as left or right in each case
        align = {"positive": "left", "negative": "right"}.get(values, "mid")

    frames = {"positive": df_pos, "negative": df_neg, "mixed": df_mix}
    df = frames[values]

    # optionally drop one of the two bounds so each is tested on its own
    if nullify == "vmin":
        vmin = None
    if nullify == "vmax":
        vmax = None

    # append the bounds as two extra rows of a copy
    expand_df = df.copy()
    expand_df.loc[3, :] = vmin
    expand_df.loc[4, :] = vmax

    styled = df.style.bar(align=align, vmin=vmin, vmax=vmax, color=["red", "green"])
    result = styled._compute().ctx
    expected = expand_df.style.bar(align=align, color=["red", "green"])._compute().ctx
    # the extended frame has extra rows, so only a subset relation is checked
    assert result.items() <= expected.items()
def test_numerics():
# test data is pre-selected for numeric values
data = DataFrame([[1, "a"], [2, "b"]])
result = data.style.bar()._compute().ctx
assert (0, 1) not in result
assert (1, 1) not in result
@pytest.mark.parametrize(
    "align, exp",
    [
        ("left", [no_bar(), bar_to(100, "green")]),
        ("right", [bar_to(100, "red"), no_bar()]),
        ("mid", [bar_to(25, "red"), bar_from_to(25, 100, "green")]),
        ("zero", [bar_from_to(33.33, 50, "red"), bar_from_to(50, 100, "green")]),
    ],
)
def test_colors_mixed(align, exp):
    """Check the CSS of a [-1, 3] column drawn with a two-color list."""
    frame = DataFrame([[-1], [3]])
    ctx = frame.style.bar(align=align, color=["red", "green"])._compute().ctx
    first_css, second_css = exp
    assert ctx == {(0, 0): first_css, (1, 0): second_css}
def test_bar_align_height():
# test when keyword height is used 'no-repeat center' and 'background-size' present
data = DataFrame([[1], [2]])
result = data.style.bar(align="left", height=50)._compute().ctx
bg_s = "linear-gradient(90deg, #d65f5f 100.0%, transparent 100.0%) no-repeat center"
expected = {
(0, 0): [("width", "10em")],
(1, 0): [
("width", "10em"),
("background", bg_s),
("background-size", "100% 50.0%"),
],
}
assert result == expected
def test_bar_value_error_raises():
    """Invalid align, width and height arguments each raise ValueError."""
    df = DataFrame({"A": [-100, -60, -30, -20]})

    # (expected message pattern, keyword arguments passed to Styler.bar)
    cases = [
        (
            "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or",
            {"align": "poorly", "color": ["#d65f5f", "#5fba7d"]},
        ),
        (r"`width` must be a value in \[0, 100\]", {"width": 200}),
        (r"`height` must be a value in \[0, 100\]", {"height": 200}),
    ]
    for msg, bar_kwargs in cases:
        with pytest.raises(ValueError, match=msg):
            df.style.bar(**bar_kwargs).to_html()

View File

@@ -0,0 +1,44 @@
import pytest
jinja2 = pytest.importorskip("jinja2")
from pandas import (
DataFrame,
MultiIndex,
)
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
    """2x2 frame with columns A/B and index x/y used by the concat tests."""
    rows = [[0, -0.609], [1, -1.228]]
    return DataFrame(data=rows, columns=["A", "B"], index=["x", "y"])
@pytest.fixture
def styler(df):
    """Styler over the ``df`` fixture, created with uuid_len=0."""
    styled = Styler(df, uuid_len=0)
    return styled
def test_concat_bad_columns(styler):
    """concat raises ValueError when the other Styler has different columns."""
    other = DataFrame([[1, 2]]).style
    expected_error = "`other.data` must have same columns as `Styler.data"
    with pytest.raises(ValueError, match=expected_error):
        styler.concat(other)
def test_concat_bad_type(styler):
    """concat raises TypeError when given a DataFrame instead of a Styler."""
    not_a_styler = DataFrame([[1, 2]])
    expected_error = "`other` must be of type `Styler`"
    with pytest.raises(TypeError, match=expected_error):
        styler.concat(not_a_styler)
def test_concat_bad_index_levels(styler, df):
    """concat raises ValueError when the other frame has a two-level index."""
    two_level = df.copy()
    two_level.index = MultiIndex.from_tuples([(0, 0), (1, 1)])
    other = two_level.style

    expected_error = "number of index levels must be same in `other`"
    with pytest.raises(ValueError, match=expected_error):
        styler.concat(other)

View File

@@ -0,0 +1,562 @@
import numpy as np
import pytest
from pandas import (
NA,
DataFrame,
IndexSlice,
MultiIndex,
NaT,
Timestamp,
option_context,
)
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
from pandas.io.formats.style_render import _str_escape
@pytest.fixture
def df():
    """2x2 frame with columns A/B and index x/y used by the format tests."""
    rows = [[0, -0.609], [1, -1.228]]
    return DataFrame(data=rows, columns=["A", "B"], index=["x", "y"])
@pytest.fixture
def styler(df):
    """Styler over the ``df`` fixture, created with uuid_len=0."""
    styled = Styler(df, uuid_len=0)
    return styled
@pytest.fixture
def df_multi():
    """4x4 frame of 0..15 with two-level MultiIndex rows and columns."""
    row_index = MultiIndex.from_product([["X", "Y"], ["x", "y"]])
    col_index = MultiIndex.from_product([["A", "B"], ["a", "b"]])
    values = np.arange(16).reshape(4, 4)
    return DataFrame(data=values, columns=col_index, index=row_index)
@pytest.fixture
def styler_multi(df_multi):
    """Styler over the ``df_multi`` fixture, created with uuid_len=0."""
    styled = Styler(df_multi, uuid_len=0)
    return styled
def test_display_format(styler):
    """
    A one-decimal format string gives every body cell a short display value.

    The per-cell checks iterate over flattened cells. Wrapping a list
    comprehension per row inside ``all(...)`` only tests that each row's
    list is non-empty, which is always true and asserts nothing.
    """
    ctx = styler.format("{:0.1f}")._translate(True, True)
    body = ctx["body"]

    # every cell in every body row must carry a display value
    assert all("display_value" in cell for row in body for cell in row)

    # data cells (everything after the index cell) are at most 3 characters
    # wide once the sign is dropped, e.g. "-0.6" -> "0.6"
    assert all(
        len(cell["display_value"].lstrip("-")) <= 3 for row in body for cell in row[1:]
    )
    assert len(body[0][1]["display_value"].lstrip("-")) <= 3
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize("columns", [True, False])
def test_display_format_index(styler, index, columns):
    """
    format_index changes the displayed row and/or column labels.

    ``index`` toggles a callable formatter on axis 0 and ``columns`` a string
    formatter on axis 1; labels of an axis left alone keep their raw text.
    """
    exp_index = ["x", "y"]
    if index:
        styler.format_index(lambda v: v.upper(), axis=0)  # test callable
        exp_index = ["X", "Y"]

    exp_columns = ["A", "B"]
    if columns:
        styler.format_index("*{}*", axis=1)  # test string
        exp_columns = ["*A*", "*B*"]

    ctx = styler._translate(True, True)

    index_cells = [row[0] for row in ctx["body"]]
    assert len(index_cells) == len(exp_index)
    for cell, label in zip(index_cells, exp_index):
        assert cell["display_value"] == label

    # ctx["head"] is a list of header rows. Take the cells of the single
    # header row after its leading cell; slicing the row list itself with
    # [1:] yields nothing here and would skip the column check entirely.
    column_cells = ctx["head"][0][1:]
    assert len(column_cells) == len(exp_columns)
    for cell, label in zip(column_cells, exp_columns):
        assert cell["display_value"] == label
def test_format_dict(styler):
    """A dict formatter applies a separate format string to each column."""
    formatters = {"A": "{:0.1f}", "B": "{0:.2%}"}
    first_row = styler.format(formatters)._translate(True, True)["body"][0]
    assert first_row[1]["display_value"] == "0.0"
    assert first_row[2]["display_value"] == "-60.90%"
def test_format_index_dict(styler):
    """A dict keyed by 0 formats the row labels with the given callable."""
    ctx = styler.format_index({0: lambda v: v.upper()})._translate(True, True)
    shown = [ctx["body"][position][0]["display_value"] for position in range(2)]
    assert shown == ["X", "Y"]
def test_format_string(styler):
    """A single format string is applied to every data cell."""
    body = styler.format("{:.2f}")._translate(True, True)["body"]
    expected_by_cell = {
        (0, 1): "0.00",
        (0, 2): "-0.61",
        (1, 1): "1.00",
        (1, 2): "-1.23",
    }
    for (row, col), text in expected_by_cell.items():
        assert body[row][col]["display_value"] == text
def test_format_callable(styler):
    """A callable formatter decides the display text of every data cell."""

    def sign_label(v):
        return "neg" if v < 0 else "pos"

    body = styler.format(sign_label)._translate(True, True)["body"]
    for row in (0, 1):
        # column A is non-negative, column B is negative in both rows
        assert body[row][1]["display_value"] == "pos"
        assert body[row][2]["display_value"] == "neg"
def test_format_with_na_rep():
# GH 21527 28358
df = DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
ctx = df.style.format(None, na_rep="-")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "-"
assert ctx["body"][0][2]["display_value"] == "-"
ctx = df.style.format("{:.2%}", na_rep="-")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "-"
assert ctx["body"][0][2]["display_value"] == "-"
assert ctx["body"][1][1]["display_value"] == "110.00%"
assert ctx["body"][1][2]["display_value"] == "120.00%"
ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate(True, True)
assert ctx["body"][0][2]["display_value"] == "-"
assert ctx["body"][1][2]["display_value"] == "120.00%"
def test_format_index_with_na_rep():
df = DataFrame([[1, 2, 3, 4, 5]], columns=["A", None, np.nan, NaT, NA])
ctx = df.style.format_index(None, na_rep="--", axis=1)._translate(True, True)
assert ctx["head"][0][1]["display_value"] == "A"
for i in [2, 3, 4, 5]:
assert ctx["head"][0][i]["display_value"] == "--"
def test_format_non_numeric_na():
# GH 21527 28358
df = DataFrame(
{
"object": [None, np.nan, "foo"],
"datetime": [None, NaT, Timestamp("20120101")],
}
)
ctx = df.style.format(None, na_rep="-")._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "-"
assert ctx["body"][0][2]["display_value"] == "-"
assert ctx["body"][1][1]["display_value"] == "-"
assert ctx["body"][1][2]["display_value"] == "-"
@pytest.mark.parametrize(
    "func, attr, kwargs",
    [
        ("format", "_display_funcs", {}),
        ("format_index", "_display_funcs_index", {"axis": 0}),
        ("format_index", "_display_funcs_columns", {"axis": 1}),
    ],
)
def test_format_clear(styler, func, attr, kwargs):
    """Calling a format method without a formatter removes a stored one."""
    apply_format = getattr(styler, func)

    def has_custom_formatter():
        # look the mapping up afresh on each call
        return (0, 0) in getattr(styler, attr)

    assert not has_custom_formatter()  # using default
    apply_format("{:.2f}", **kwargs)
    assert has_custom_formatter()  # formatter is specified
    apply_format(**kwargs)
    assert not has_custom_formatter()  # formatter cleared to default
@pytest.mark.parametrize(
    "escape, exp",
    [
        ("html", "&lt;&gt;&amp;&#34;%$#_{}~^\\~ ^ \\ "),
        (
            "latex",
            '<>\\&"\\%\\$\\#\\_\\{\\}\\textasciitilde \\textasciicircum '
            "\\textbackslash \\textasciitilde \\space \\textasciicircum \\space "
            "\\textbackslash \\space ",
        ),
    ],
)
def test_format_escape_html(escape, exp):
    """Escaping is applied to the value only, not to the formatter text."""
    chars = '<>&"%$#_{}~^\\~ ^ \\ '
    df = DataFrame([[chars]])

    def data_cell(content):
        return f'<td id="T__row0_col0" class="data row0 col0" >&{content}&</td>'

    # without escaping the raw characters come through; with escaping
    # only the value should be escaped before passing to the formatter
    stages = [(None, chars), (escape, exp)]

    for escape_mode, shown in stages:
        s = Styler(df, uuid_len=0).format("&{0}&", escape=escape_mode)
        assert data_cell(shown) in s.to_html()

    # also test format_index()
    styler = Styler(DataFrame(columns=[chars]), uuid_len=0)
    for escape_mode, shown in stages:
        styler.format_index("&{0}&", escape=escape_mode, axis=1)
        label = styler._translate(True, True)["head"][0][1]["display_value"]
        assert label == f"&{shown}&"
@pytest.mark.parametrize(
    "chars, expected",
    [
        (
            r"$ \$&%#_{}~^\ $ &%#_{}~^\ $",
            "".join(
                [
                    r"$ \$&%#_{}~^\ $ ",
                    r"\&\%\#\_\{\}\textasciitilde \textasciicircum ",
                    r"\textbackslash \space \$",
                ]
            ),
        ),
        (
            r"\( &%#_{}~^\ \) &%#_{}~^\ \(",
            "".join(
                [
                    r"\( &%#_{}~^\ \) ",
                    r"\&\%\#\_\{\}\textasciitilde \textasciicircum ",
                    r"\textbackslash \space \textbackslash (",
                ]
            ),
        ),
        (
            r"$\&%#_{}^\$",
            r"\$\textbackslash \&\%\#\_\{\}\textasciicircum \textbackslash \$",
        ),
        (
            r"$ \frac{1}{2} $ \( \frac{1}{2} \)",
            "".join(
                [
                    r"$ \frac{1}{2} $",
                    r" \textbackslash ( \textbackslash frac\{1\}\{2\} \textbackslash )",
                ]
            ),
        ),
    ],
)
def test_format_escape_latex_math(chars, expected):
    """Check the "latex-math" escape output for a single-cell frame."""
    # GH 51903
    # latex-math escape works for each DataFrame cell separately. If we have
    # a combination of dollar signs and brackets, the dollar sign would apply.
    styled = DataFrame([[chars]]).style.format("{0}", escape="latex-math")
    first_cell = styled._translate(True, True)["body"][0][1]
    assert first_cell["display_value"] == expected
def test_format_escape_na_rep():
# tests the na_rep is not escaped
df = DataFrame([['<>&"', None]])
s = Styler(df, uuid_len=0).format("X&{0}>X", escape="html", na_rep="&")
ex = '<td id="T__row0_col0" class="data row0 col0" >X&&lt;&gt;&amp;&#34;>X</td>'
expected2 = '<td id="T__row0_col1" class="data row0 col1" >&</td>'
assert ex in s.to_html()
assert expected2 in s.to_html()
# also test for format_index()
df = DataFrame(columns=['<>&"', None])
styler = Styler(df, uuid_len=0)
styler.format_index("X&{0}>X", escape="html", na_rep="&", axis=1)
ctx = styler._translate(True, True)
assert ctx["head"][0][1]["display_value"] == "X&&lt;&gt;&amp;&#34;>X"
assert ctx["head"][0][2]["display_value"] == "&"
def test_format_escape_floats(styler):
    """escape="html" leaves number formatting and precision untouched."""
    # test given formatter for number format is not impacted by escape
    html = styler.format("{:.1f}", escape="html").to_html()
    for fragment in [">0.0<", ">1.0<", ">-1.2<", ">-0.6<"]:
        assert fragment in html

    # tests precision of floats is not impacted by escape
    html = styler.format(precision=1, escape="html").to_html()
    for fragment in [">0<", ">1<", ">-1.2<", ">-0.6<"]:
        assert fragment in html
@pytest.mark.parametrize("formatter", [5, True, [2.0]])
@pytest.mark.parametrize("func", ["format", "format_index"])
def test_format_raises(styler, formatter, func):
    """Formatters that are neither str nor callable raise TypeError."""
    format_method = getattr(styler, func)
    with pytest.raises(TypeError, match="expected str or callable"):
        format_method(formatter)
@pytest.mark.parametrize(
    "precision, expected",
    [
        (1, ["1.0", "2.0", "3.2", "4.6"]),
        (2, ["1.00", "2.01", "3.21", "4.57"]),
        (3, ["1.000", "2.009", "3.212", "4.566"]),
    ],
)
def test_format_with_precision(precision, expected):
    """precision rounds data cells and float column labels the same way."""
    # Issue #13257
    numbers = [1.0, 2.0090, 3.2121, 4.566]
    # the same floats serve as the single data row and as the column labels
    df = DataFrame([list(numbers)], columns=list(numbers))

    styler = Styler(df)
    styler.format(precision=precision)
    styler.format_index(precision=precision, axis=1)
    ctx = styler._translate(True, True)

    body_row, head_row = ctx["body"][0], ctx["head"][0]
    for position, exp in enumerate(expected, start=1):
        assert body_row[position]["display_value"] == exp  # format test
        assert head_row[position]["display_value"] == exp  # format_index test
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
    "level, expected",
    [
        (0, ["X", "X", "_", "_"]),  # level int
        ("zero", ["X", "X", "_", "_"]),  # level name
        (1, ["_", "_", "X", "X"]),  # other level int
        ("one", ["_", "_", "X", "X"]),  # other level name
        ([0, 1], ["X", "X", "X", "X"]),  # both levels
        ([0, "zero"], ["X", "X", "_", "_"]),  # level int and name simultaneous
        ([0, "one"], ["X", "X", "X", "X"]),  # both levels as int and name
        (["one", "zero"], ["X", "X", "X", "X"]),  # both level names, reversed
    ],
)
def test_format_index_level(axis, level, expected):
    """format_index(level=...) reformats only the selected MultiIndex levels."""
    midx = MultiIndex.from_arrays([["_", "_"], ["_", "_"]], names=["zero", "one"])
    df = DataFrame([[1, 2], [3, 4]])
    setattr(df, "index" if axis == 0 else "columns", midx)

    styler = df.style.format_index(lambda v: "X", level=level, axis=axis)
    ctx = styler._translate(True, True)

    # collect the labels level by level: both entries of level 0, then level 1
    if axis == 0:  # compare index
        result = [
            ctx["body"][row][lvl]["display_value"]
            for lvl in range(2)
            for row in range(2)
        ]
    else:  # compare columns
        result = [
            ctx["head"][lvl][col + 1]["display_value"]
            for lvl in range(2)
            for col in range(2)
        ]
    assert expected == result
def test_format_subset():
df = DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"])
ctx = df.style.format(
{"a": "{:0.1f}", "b": "{0:.2%}"}, subset=IndexSlice[0, :]
)._translate(True, True)
expected = "0.1"
raw_11 = "1.123400"
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == raw_11
assert ctx["body"][0][2]["display_value"] == "12.34%"
ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, :])._translate(True, True)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == raw_11
ctx = df.style.format("{:0.1f}", subset=IndexSlice["a"])._translate(True, True)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][0][2]["display_value"] == "0.123400"
ctx = df.style.format("{:0.1f}", subset=IndexSlice[0, "a"])._translate(True, True)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == raw_11
ctx = df.style.format("{:0.1f}", subset=IndexSlice[[0, 1], ["a"]])._translate(
True, True
)
assert ctx["body"][0][1]["display_value"] == expected
assert ctx["body"][1][1]["display_value"] == "1.1"
assert ctx["body"][0][2]["display_value"] == "0.123400"
assert ctx["body"][1][2]["display_value"] == raw_11
@pytest.mark.parametrize("formatter", [None, "{:,.1f}"])
@pytest.mark.parametrize("decimal", [".", "*"])
@pytest.mark.parametrize("precision", [None, 2])
@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)])
def test_format_thousands(formatter, decimal, precision, func, col):
    """thousands="_" groups the digits of float, int and complex values."""
    # same check for a float, an int and a complex value, in that order
    samples = (1000000.123456789, 1000000, 1 + 1000000.123456789j)
    for value in samples:
        styler = DataFrame([[value]], index=[value]).style
        result = getattr(styler, func)(
            thousands="_", formatter=formatter, decimal=decimal, precision=precision
        )._translate(True, True)
        assert "1_000_000" in result["body"][0][col]["display_value"]
@pytest.mark.parametrize("formatter", [None, "{:,.4f}"])
@pytest.mark.parametrize("thousands", [None, ",", "*"])
@pytest.mark.parametrize("precision", [None, 4])
@pytest.mark.parametrize("func, col", [("format", 1), ("format_index", 0)])
def test_format_decimal(formatter, thousands, precision, func, col):
    """decimal="_" replaces the decimal point of float and complex values."""
    # same check for a float and a complex value, in that order
    samples = (1000000.123456789, 1 + 1000000.123456789j)
    for value in samples:
        styler = DataFrame([[value]], index=[value]).style
        result = getattr(styler, func)(
            decimal="_", formatter=formatter, thousands=thousands, precision=precision
        )._translate(True, True)
        assert "000_123" in result["body"][0][col]["display_value"]
def test_str_escape_error():
    """_str_escape raises ValueError for an unknown escape mode on text input."""
    msg = "`escape` only permitted in {'html', 'latex', 'latex-math'}, got "
    for bad_escape in ("bad_escape", []):
        with pytest.raises(ValueError, match=msg):
            _str_escape("text", bad_escape)

    _str_escape(2.00, "bad_escape")  # OK since dtype is float
def test_long_int_formatting():
df = DataFrame(data=[[1234567890123456789]], columns=["test"])
styler = df.style
ctx = styler._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "1234567890123456789"
styler = df.style.format(thousands="_")
ctx = styler._translate(True, True)
assert ctx["body"][0][1]["display_value"] == "1_234_567_890_123_456_789"
def test_format_options():
df = DataFrame({"int": [2000, 1], "float": [1.009, None], "str": ["&<", "&~"]})
ctx = df.style._translate(True, True)
# test option: na_rep
assert ctx["body"][1][2]["display_value"] == "nan"
with option_context("styler.format.na_rep", "MISSING"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][1][2]["display_value"] == "MISSING"
# test option: decimal and precision
assert ctx["body"][0][2]["display_value"] == "1.009000"
with option_context("styler.format.decimal", "_"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][2]["display_value"] == "1_009000"
with option_context("styler.format.precision", 2):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][2]["display_value"] == "1.01"
# test option: thousands
assert ctx["body"][0][1]["display_value"] == "2000"
with option_context("styler.format.thousands", "_"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][1]["display_value"] == "2_000"
# test option: escape
assert ctx["body"][0][3]["display_value"] == "&<"
assert ctx["body"][1][3]["display_value"] == "&~"
with option_context("styler.format.escape", "html"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][3]["display_value"] == "&amp;&lt;"
with option_context("styler.format.escape", "latex"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde "
with option_context("styler.format.escape", "latex-math"):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][1][3]["display_value"] == "\\&\\textasciitilde "
# test option: formatter
with option_context("styler.format.formatter", {"int": "{:,.2f}"}):
ctx_with_op = df.style._translate(True, True)
assert ctx_with_op["body"][0][1]["display_value"] == "2,000.00"
def test_precision_zero(df):
    """With precision=0 both values of column B are displayed as "-1"."""
    body = Styler(df, precision=0)._translate(True, True)["body"]
    for row in (0, 1):
        assert body[row][2]["display_value"] == "-1"
@pytest.mark.parametrize(
    "formatter, exp",
    [
        (lambda x: f"{x:.3f}", "9.000"),
        ("{:.2f}", "9.00"),
        ({0: "{:.1f}"}, "9.0"),
        (None, "9"),
    ],
)
def test_formatter_options_validator(formatter, exp):
    """Callable, str, dict and None are accepted by the formatter option."""
    frame = DataFrame([[9]])
    with option_context("styler.format.formatter", formatter):
        latex = frame.style.to_latex()
        assert f" {exp} " in latex
def test_formatter_options_raises():
    """A list value for the formatter option is rejected with ValueError."""
    expected_error = "Value must be an instance of"
    bad_value = ["bad", "type"]
    with pytest.raises(ValueError, match=expected_error), option_context(
        "styler.format.formatter", bad_value
    ):
        DataFrame().style.to_latex()
def test_1level_multiindex():
# GH 43383
midx = MultiIndex.from_product([[1, 2]], names=[""])
df = DataFrame(-1, index=midx, columns=[0, 1])
ctx = df.style._translate(True, True)
assert ctx["body"][0][0]["display_value"] == "1"
assert ctx["body"][0][0]["is_visible"] is True
assert ctx["body"][1][0]["display_value"] == "2"
assert ctx["body"][1][0]["is_visible"] is True
def test_boolean_format():
# gh 46384: booleans do not collapse to integer representation on display
df = DataFrame([[True, False]])
ctx = df.style._translate(True, True)
assert ctx["body"][0][1]["display_value"] is True
assert ctx["body"][0][2]["display_value"] is False
@pytest.mark.parametrize(
    "hide, labels",
    [
        (False, [1, 2]),
        (True, [1, 2, 3, 4]),
    ],
)
def test_relabel_raise_length(styler_multi, hide, labels):
    """relabel_index raises ValueError for a label list of the wrong length."""
    if hide:
        # hiding two of the four rows makes four labels too many
        hidden_rows = [("X", "x"), ("Y", "y")]
        styler_multi.hide(axis=0, subset=hidden_rows)

    expected_error = "``labels`` must be of length equal"
    with pytest.raises(ValueError, match=expected_error):
        styler_multi.relabel_index(labels=labels)
def test_relabel_index(styler_multi):
    """relabel_index changes display values of visible rows, not their values."""
    styler_multi.hide(axis=0, subset=[("X", "x"), ("Y", "y")])
    styler_multi.relabel_index(labels=[(1, 2), (3, 4)])
    body = styler_multi._translate(True, True)["body"]

    # (cell, original value, relabelled display value)
    checks = [
        (body[0][0], "X", 1),
        (body[0][1], "y", 2),
        (body[1][0], "Y", 3),
        (body[1][1], "x", 4),
    ]
    for cell, value, label in checks:
        assert {"value": value, "display_value": label}.items() <= cell.items()
def test_relabel_columns(styler_multi):
    """relabel_index(axis=1) changes display values of visible column labels."""
    styler_multi.hide(axis=1, subset=[("A", "a"), ("B", "b")])
    styler_multi.relabel_index(axis=1, labels=[(1, 2), (3, 4)])
    head = styler_multi._translate(True, True)["head"]

    # (cell, original value, relabelled display value)
    checks = [
        (head[0][3], "A", 1),
        (head[0][4], "B", 3),
        (head[1][3], "b", 2),
        (head[1][4], "a", 4),
    ]
    for cell, value, label in checks:
        assert {"value": value, "display_value": label}.items() <= cell.items()
def test_relabel_roundtrip(styler):
    """Relabelling with "{}" placeholders reproduces the original labels."""
    styler.relabel_index(["{}", "{}"])
    body = styler._translate(True, True)["body"]
    for row, label in enumerate(["x", "y"]):
        wanted = {"value": label, "display_value": label}
        assert wanted.items() <= body[row][0].items()

View File

@@ -0,0 +1,218 @@
import numpy as np
import pytest
from pandas import (
NA,
DataFrame,
IndexSlice,
)
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture(params=[(None, "float64"), (NA, "Int64")])
def df(request):
    """Frame with a missing value in each column, built once per (null, dtype) pair."""
    # GH 45804
    missing, dtype = request.param
    columns = {"A": [0, np.nan, 10], "B": [1, missing, 2]}
    return DataFrame(columns, dtype=dtype)
@pytest.fixture
def styler(df):
    """Styler over the ``df`` fixture, created with uuid_len=0."""
    styled = Styler(df, uuid_len=0)
    return styled
def test_highlight_null(styler):
    """highlight_null gives both missing cells of row 1 a red background."""
    ctx = styler.highlight_null()._compute().ctx
    red = [("background-color", "red")]
    assert ctx == {(1, 0): red, (1, 1): red}
def test_highlight_null_subset(styler):
    """Chained highlight_null calls color their own column subsets."""
    # GH 31345
    styled = styler.highlight_null(color="red", subset=["A"])
    styled = styled.highlight_null(color="green", subset=["B"])
    ctx = styled._compute().ctx

    assert ctx == {
        (1, 0): [("background-color", "red")],
        (1, 1): [("background-color", "green")],
    }
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
def test_highlight_minmax_basic(df, f):
    """Row-wise min/max highlighting marks one cell in rows 0 and 2 only."""
    red = [("background-color", "red")]
    expected = {
        (0, 1): red,
        # ignores NaN row,
        (2, 0): red,
    }
    # negating the data lets the min case reuse the max expectations
    frame = -df if f == "highlight_min" else df
    highlight = getattr(frame.style, f)
    ctx = highlight(axis=1, color="red")._compute().ctx
    assert ctx == expected
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"axis": None, "color": "red"},  # test axis
        {"axis": 0, "subset": ["A"], "color": "red"},  # test subset and ignores NaN
        {"axis": None, "props": "background-color: red"},  # test props
    ],
)
def test_highlight_minmax_ext(df, f, kwargs):
    """Each keyword combination highlights only cell (2, 0)."""
    # ``highlight_min`` runs on the negated frame so the same cell is the extreme.
    frame = -df if f == "highlight_min" else df
    styled = getattr(frame.style, f)(**kwargs)
    assert styled._compute().ctx == {(2, 0): [("background-color", "red")]}
@pytest.mark.parametrize("f", ["highlight_min", "highlight_max"])
@pytest.mark.parametrize("axis", [None, 0, 1])
def test_highlight_minmax_nulls(f, axis):
    """Min/max highlighting on frames mixing NA, None and NaN (GH 42750)."""
    yellow = [("background-color", "yellow")]
    expected = {(1, 0): yellow, (1, 1): yellow}
    if axis == 1:
        # row-wise evaluation additionally marks the only valid cell of row 2
        expected[(2, 1)] = yellow

    # The two data sets mirror each other in sign so one expectation serves both.
    data = (
        {"a": [NA, 1, None], "b": [np.nan, 1, -1]}
        if f == "highlight_max"
        else {"a": [NA, -1, None], "b": [np.nan, -1, 1]}
    )
    frame = DataFrame(data)
    result = getattr(frame.style, f)(axis=axis)._compute().ctx
    assert result == expected
@pytest.mark.parametrize(
    "kwargs",
    [
        {"left": 0, "right": 1},  # test basic range
        {"left": 0, "right": 1, "props": "background-color: yellow"},  # test props
        {"left": -100, "right": 100, "subset": IndexSlice[[0, 1], :]},  # test subset
        {"left": 0, "subset": IndexSlice[[0, 1], :]},  # test no right
        {"right": 1},  # test no left
        {"left": [0, 0, 11], "axis": 0},  # test left as sequence
        {"left": DataFrame({"A": [0, 0, 11], "B": [1, 1, 11]}), "axis": None},  # axis
        {"left": 0, "right": [0, 1], "axis": 1},  # test sequence right
    ],
)
def test_highlight_between(styler, kwargs):
    """Every keyword combination highlights exactly the two cells of row 0."""
    yellow = [("background-color", "yellow")]
    highlighted = styler.highlight_between(**kwargs)._compute()
    assert highlighted.ctx == {(0, 0): yellow, (0, 1): yellow}
@pytest.mark.parametrize(
    "arg, map, axis",
    [
        ("left", [1, 2], 0),  # 0 axis has 3 elements not 2
        ("left", [1, 2, 3], 1),  # 1 axis has 2 elements not 3
        ("left", np.array([[1, 2], [1, 2]]), None),  # data is (3, 2) not (2, 2)
        ("right", [1, 2], 0),  # same tests as above for 'right' not 'left'
        ("right", [1, 2, 3], 1),  # ..
        ("right", np.array([[1, 2], [1, 2]]), None),  # ..
    ],
)
def test_highlight_between_raises(arg, styler, map, axis):
    """A ``left``/``right`` bound of the wrong shape raises ``ValueError``."""
    call_kwargs = {arg: map, "axis": axis}
    with pytest.raises(ValueError, match=f"supplied '{arg}' is not correct shape"):
        styler.highlight_between(**call_kwargs)._compute()
def test_highlight_between_raises2(styler):
    """An unrecognised ``inclusive`` value, string or not, raises ``ValueError``."""
    msg = "values can be 'both', 'left', 'right', or 'neither'"
    for bad_inclusive in ("badstring", 1):
        with pytest.raises(ValueError, match=msg):
            styler.highlight_between(inclusive=bad_inclusive)._compute()
@pytest.mark.parametrize(
    "inclusive, expected",
    [
        (
            "both",
            {
                (0, 0): [("background-color", "yellow")],
                (0, 1): [("background-color", "yellow")],
            },
        ),
        ("neither", {}),
        ("left", {(0, 0): [("background-color", "yellow")]}),
        ("right", {(0, 1): [("background-color", "yellow")]}),
    ],
)
def test_highlight_between_inclusive(styler, inclusive, expected):
    """``inclusive`` decides which boundary cells of row 0 are highlighted."""
    computed = styler.highlight_between(
        left=0,
        right=1,
        subset=IndexSlice[[0, 1], :],
        inclusive=inclusive,
    )._compute()
    assert computed.ctx == expected
@pytest.mark.parametrize(
    "kwargs",
    [
        {"q_left": 0.5, "q_right": 1, "axis": 0},  # base case
        {"q_left": 0.5, "q_right": 1, "axis": None},  # test axis
        {"q_left": 0, "q_right": 1, "subset": IndexSlice[2, :]},  # test subset
        {"q_left": 0.5, "axis": 0},  # test no high
        {"q_right": 1, "subset": IndexSlice[2, :], "axis": 1},  # test no low
        {"q_left": 0.5, "axis": 0, "props": "background-color: yellow"},  # test props
    ],
)
def test_highlight_quantile(styler, kwargs):
    """Every keyword combination highlights exactly the two cells of row 2."""
    yellow = [("background-color", "yellow")]
    computed = styler.highlight_quantile(**kwargs)._compute()
    assert computed.ctx == {(2, 0): yellow, (2, 1): yellow}
@pytest.mark.parametrize(
    "f,kwargs",
    [
        ("highlight_min", {"axis": 1, "subset": IndexSlice[1, :]}),
        ("highlight_max", {"axis": 0, "subset": [0]}),
        ("highlight_quantile", {"axis": None, "q_left": 0.6, "q_right": 0.8}),
        ("highlight_between", {"subset": [0]}),
    ],
)
@pytest.mark.parametrize(
    "df",
    [
        DataFrame([[0, 10], [20, 30]], dtype=int),
        DataFrame([[0, 10], [20, 30]], dtype=float),
        DataFrame([[0, 10], [20, 30]], dtype="datetime64[ns]"),
        DataFrame([[0, 10], [20, 30]], dtype=str),
        DataFrame([[0, 10], [20, 30]], dtype="timedelta64[ns]"),
    ],
)
def test_all_highlight_dtypes(f, kwargs, df):
    """Each highlight method marks cell (1, 0) for every parametrized dtype.

    The str frame is not exercised with ``highlight_quantile``.
    """
    if f == "highlight_quantile" and isinstance(df.iloc[0, 0], str):
        return None  # quantile incompatible with str

    if f == "highlight_between":
        # Build a fresh dict instead of writing into ``kwargs``: the parametrized
        # dict object is reused for every generated case, so an in-place update
        # would carry ``left`` over from one dtype case to the next.
        kwargs = {**kwargs, "left": df.iloc[1, 0]}  # set the range low for testing

    expected = {(1, 0): [("background-color", "yellow")]}
    result = getattr(df.style, f)(**kwargs)._compute().ctx
    assert result == expected

View File

@@ -0,0 +1,335 @@
import gc
import numpy as np
import pytest
from pandas import (
DataFrame,
IndexSlice,
Series,
)
pytest.importorskip("matplotlib")
pytest.importorskip("jinja2")
import matplotlib as mpl
from pandas.io.formats.style import Styler
@pytest.fixture(autouse=True)
def mpl_cleanup():
    """Autouse fixture that snapshots and restores matplotlib state around a test.

    Before the test it copies the units registry, enters ``rc_context`` and
    selects the "template" backend. After the test it restores the registry,
    closes all figures and runs a garbage collection pass.

    NOTE(review): block nesting was reconstructed from a view without
    indentation; the ``yield`` is assumed to sit inside ``rc_context`` — confirm.
    """
    # matplotlib/testing/decorators.py#L24
    # 1) Resets units registry
    # 2) Resets rc_context
    # 3) Closes all figures
    mpl = pytest.importorskip("matplotlib")
    mpl_units = pytest.importorskip("matplotlib.units")
    plt = pytest.importorskip("matplotlib.pyplot")
    # Snapshot taken before the test so it can be put back afterwards.
    orig_units_registry = mpl_units.registry.copy()
    with mpl.rc_context():
        mpl.use("template")
        yield
    mpl_units.registry.clear()
    mpl_units.registry.update(orig_units_registry)
    plt.close("all")
    # https://matplotlib.org/stable/users/prev_whats_new/whats_new_3.6.0.html#garbage-collection-is-no-longer-run-on-figure-close # noqa: E501
    gc.collect(1)
@pytest.fixture
def df():
    """Two-by-two integer frame with columns "A" and "B"."""
    rows = [[1, 2], [2, 4]]
    return DataFrame(rows, columns=["A", "B"])
@pytest.fixture
def styler(df):
    """Styler built from the ``df`` fixture with ``uuid_len=0``."""
    wrapped = Styler(df, uuid_len=0)
    return wrapped
@pytest.fixture
def df_blank():
    """All-zero two-by-two frame labelled X/Y by row and A/B by column."""
    zeros = [[0, 0], [0, 0]]
    return DataFrame(zeros, columns=["A", "B"], index=["X", "Y"])
@pytest.fixture
def styler_blank(df_blank):
    """Styler built from the ``df_blank`` fixture with ``uuid_len=0``."""
    wrapped = Styler(df_blank, uuid_len=0)
    return wrapped
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
def test_function_gradient(styler, f):
    """Both gradient methods emit hex colours, identical across each row."""
    gradient = getattr(styler, f)
    for c_map in (None, "YlOrRd"):
        result = gradient(cmap=c_map)._compute().ctx
        # the first property of every styled cell carries a hex colour
        assert all("#" in props[0][1] for props in result.values())
        for row in (0, 1):
            assert result[(row, 0)] == result[(row, 1)]
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
def test_background_gradient_color(styler, f):
    """A single-cell subset gets the expected colour properties."""
    expected_props = {
        "background_gradient": [("background-color", "#fff7fb"), ("color", "#000000")],
        "text_gradient": [("color", "#fff7fb")],
    }
    result = getattr(styler, f)(subset=IndexSlice[1, "A"])._compute().ctx
    assert result[(1, 0)] == expected_props[f]
@pytest.mark.parametrize(
    "axis, expected",
    [
        (0, ["low", "low", "high", "high"]),
        (1, ["low", "high", "low", "high"]),
        (None, ["low", "mid", "mid", "high"]),
    ],
)
@pytest.mark.parametrize("f", ["background_gradient", "text_gradient"])
def test_background_gradient_axis(styler, axis, expected, f):
    """``axis`` controls whether colours scale per column, per row or overall."""
    colours_by_method = {
        "background_gradient": {
            "low": [("background-color", "#f7fbff"), ("color", "#000000")],
            "mid": [("background-color", "#abd0e6"), ("color", "#000000")],
            "high": [("background-color", "#08306b"), ("color", "#f1f1f1")],
        },
        "text_gradient": {
            "low": [("color", "#f7fbff")],
            "mid": [("color", "#abd0e6")],
            "high": [("color", "#08306b")],
        },
    }
    colors = colours_by_method[f]
    result = getattr(styler, f)(cmap="Blues", axis=axis)._compute().ctx
    # ``expected`` lists the level of each cell in row-major order
    cells = [(0, 0), (0, 1), (1, 0), (1, 1)]
    for cell, level in zip(cells, expected):
        assert result[cell] == colors[level]
@pytest.mark.parametrize(
    "cmap, expected",
    [
        (
            "PuBu",
            {
                (4, 5): [("background-color", "#86b0d3"), ("color", "#000000")],
                (4, 6): [("background-color", "#83afd3"), ("color", "#f1f1f1")],
            },
        ),
        (
            "YlOrRd",
            {
                (4, 8): [("background-color", "#fd913e"), ("color", "#000000")],
                (4, 9): [("background-color", "#fd8f3d"), ("color", "#f1f1f1")],
            },
        ),
        (
            None,
            {
                (7, 0): [("background-color", "#48c16e"), ("color", "#f1f1f1")],
                (7, 1): [("background-color", "#4cc26c"), ("color", "#000000")],
            },
        ),
    ],
)
def test_text_color_threshold(cmap, expected):
    """Adjacent cells around the threshold switch text colour (GH 39888)."""
    frame = DataFrame(np.arange(100).reshape(10, 10))
    result = frame.style.background_gradient(cmap=cmap, axis=None)._compute().ctx
    for cell, props in expected.items():
        assert result[cell] == props
def test_background_gradient_vmin_vmax():
    """Cells outside ``[vmin, vmax]`` are styled like the cell at the bound (GH 12145)."""
    frame = DataFrame(range(5))
    ctx = frame.style.background_gradient(vmin=1, vmax=3)._compute().ctx
    # (outside cell, cell sitting on the bound)
    for outside, at_bound in (((0, 0), (1, 0)), ((4, 0), (3, 0))):
        assert ctx[outside] == ctx[at_bound]
def test_background_gradient_int64():
    """Nullable ``Int64`` data is styled the same as default integer data (GH 28869)."""
    plain = Series(range(3)).to_frame()
    nullable = Series(range(3), dtype="Int64").to_frame()
    ctx1 = plain.style.background_gradient()._compute().ctx
    ctx2 = nullable.style.background_gradient()._compute().ctx
    for row in range(3):
        assert ctx2[(row, 0)] == ctx1[(row, 0)]
@pytest.mark.parametrize(
    "axis, gmap, expected",
    [
        (
            0,
            [1, 2],
            {
                (0, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
                (1, 0): [("background-color", "#023858"), ("color", "#f1f1f1")],
                (0, 1): [("background-color", "#fff7fb"), ("color", "#000000")],
                (1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
            },
        ),
        (
            1,
            [1, 2],
            {
                (0, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
                (1, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
                (0, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
                (1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
            },
        ),
        (
            None,
            np.array([[2, 1], [1, 2]]),
            {
                (0, 0): [("background-color", "#023858"), ("color", "#f1f1f1")],
                (1, 0): [("background-color", "#fff7fb"), ("color", "#000000")],
                (0, 1): [("background-color", "#fff7fb"), ("color", "#000000")],
                (1, 1): [("background-color", "#023858"), ("color", "#f1f1f1")],
            },
        ),
    ],
)
def test_background_gradient_gmap_array(styler_blank, axis, gmap, expected):
    """A list or ndarray ``gmap`` determines the colours of the all-zero frame."""
    # tests when gmap is given as a sequence and converted to ndarray
    styled = styler_blank.background_gradient(axis=axis, gmap=gmap)
    assert styled._compute().ctx == expected
@pytest.mark.parametrize(
    "gmap, axis", [([1, 2, 3], 0), ([1, 2], 1), (np.array([[1, 2], [1, 2]]), None)]
)
def test_background_gradient_gmap_array_raises(gmap, axis):
    """An array-like ``gmap`` whose shape does not fit the data raises."""
    # test when gmap as converted ndarray is bad shape
    zeros = DataFrame([[0, 0, 0], [0, 0, 0]])
    with pytest.raises(ValueError, match="supplied 'gmap' is not correct shape"):
        zeros.style.background_gradient(gmap=gmap, axis=axis)._compute()
@pytest.mark.parametrize(
    "gmap",
    [
        DataFrame(  # reverse the columns
            [[2, 1], [1, 2]], columns=["B", "A"], index=["X", "Y"]
        ),
        DataFrame(  # reverse the index
            [[2, 1], [1, 2]], columns=["A", "B"], index=["Y", "X"]
        ),
        DataFrame(  # reverse the index and columns
            [[1, 2], [2, 1]], columns=["B", "A"], index=["Y", "X"]
        ),
        DataFrame(  # add unnecessary columns
            [[1, 2, 3], [2, 1, 3]], columns=["A", "B", "C"], index=["X", "Y"]
        ),
        DataFrame(  # add unnecessary index
            [[1, 2], [2, 1], [3, 3]], columns=["A", "B"], index=["X", "Y", "Z"]
        ),
    ],
)
@pytest.mark.parametrize(
    "subset, exp_gmap",  # exp_gmap is underlying map DataFrame should conform to
    [
        (None, [[1, 2], [2, 1]]),
        (["A"], [[1], [2]]),  # slice only column "A" in data and gmap
        (["B", "A"], [[2, 1], [1, 2]]),  # reverse the columns in data
        (IndexSlice["X", :], [[1, 2]]),  # slice only index "X" in data and gmap
        (IndexSlice[["Y", "X"], :], [[2, 1], [1, 2]]),  # reverse the index in data
    ],
)
def test_background_gradient_gmap_dataframe_align(styler_blank, gmap, subset, exp_gmap):
    """A DataFrame ``gmap`` is aligned to the (possibly subset) data by label.

    The labelled ``gmap`` must produce the same styles as ``exp_gmap``, the
    plain nested list it should conform to after alignment.
    """
    # test gmap given as DataFrame that it aligns to the data including subset
    # Two independent Stylers are required. Calling ``background_gradient``
    # twice on ``styler_blank`` queues both gradients on one object, so the
    # comparison would be between a ``ctx`` and itself and could never fail.
    data = styler_blank.data
    expected = Styler(data, uuid_len=0).background_gradient(
        axis=None, gmap=exp_gmap, subset=subset
    )
    result = Styler(data, uuid_len=0).background_gradient(
        axis=None, gmap=gmap, subset=subset
    )
    assert expected._compute().ctx == result._compute().ctx
@pytest.mark.parametrize(
    "gmap, axis, exp_gmap",
    [
        (Series([2, 1], index=["Y", "X"]), 0, [[1, 1], [2, 2]]),  # reverse the index
        (Series([2, 1], index=["B", "A"]), 1, [[1, 2], [1, 2]]),  # reverse the cols
        (Series([1, 2, 3], index=["X", "Y", "Z"]), 0, [[1, 1], [2, 2]]),  # add idx
        (Series([1, 2, 3], index=["A", "B", "C"]), 1, [[1, 2], [1, 2]]),  # add col
    ],
)
def test_background_gradient_gmap_series_align(styler_blank, gmap, axis, exp_gmap):
    """A Series ``gmap`` is aligned by label along ``axis``.

    The labelled Series must produce the same styles as ``exp_gmap``, the
    equivalent full two-dimensional map applied with ``axis=None``.
    """
    # test gmap given as Series that it aligns to the data including subset
    # Two independent Stylers are required. Chaining both calls on
    # ``styler_blank`` would make ``expected`` and ``result`` the same object,
    # so their ``ctx`` would be compared with itself.
    data = styler_blank.data
    expected = (
        Styler(data, uuid_len=0)
        .background_gradient(axis=None, gmap=exp_gmap)
        ._compute()
    )
    result = (
        Styler(data, uuid_len=0).background_gradient(axis=axis, gmap=gmap)._compute()
    )
    assert expected.ctx == result.ctx
@pytest.mark.parametrize(
    "gmap, axis",
    [
        (DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 1),
        (DataFrame([[1, 2], [2, 1]], columns=["A", "B"], index=["X", "Y"]), 0),
    ],
)
def test_background_gradient_gmap_wrong_dataframe(styler_blank, gmap, axis):
    """A DataFrame ``gmap`` combined with ``axis`` 0 or 1 raises ``ValueError``."""
    # test giving a gmap in DataFrame but with wrong axis
    expected_message = (
        "'gmap' is a DataFrame but underlying data for operations is a Series"
    )
    with pytest.raises(ValueError, match=expected_message):
        styler_blank.background_gradient(gmap=gmap, axis=axis)._compute()
def test_background_gradient_gmap_wrong_series(styler_blank):
    """A Series ``gmap`` combined with ``axis=None`` raises ``ValueError``."""
    # test giving a gmap in Series form but with wrong axis
    row_map = Series([1, 2], index=["X", "Y"])
    expected_message = (
        "'gmap' is a Series but underlying data for operations is a DataFrame"
    )
    with pytest.raises(ValueError, match=expected_message):
        styler_blank.background_gradient(gmap=row_map, axis=None)._compute()
def test_background_gradient_nullable_dtypes():
    """Float data with NaN and ``Int64`` data with None style identically (GH 50712)."""
    as_float = DataFrame([[1], [0], [np.nan]], dtype=float)
    as_nullable = DataFrame([[1], [0], [None]], dtype="Int64")
    float_ctx = as_float.style.background_gradient()._compute().ctx
    nullable_ctx = as_nullable.style.background_gradient()._compute().ctx
    assert float_ctx == nullable_ctx
@pytest.mark.parametrize(
    "cmap",
    ["PuBu", mpl.colormaps["PuBu"]],
)
def test_bar_colormap(cmap):
    """``Styler.bar`` accepts a colormap given by name or as an object."""
    frame = DataFrame([[1, 2], [3, 4]])
    ctx = frame.style.bar(cmap=cmap, axis=None)._compute().ctx
    expected_hex = {
        (0, 0): "#d0d1e6",
        (1, 0): "#056faf",
        (0, 1): "#73a9cf",
        (1, 1): "#023858",
    }
    for cell, colour in expected_hex.items():
        # the colour is looked for in the value of the cell's second property
        assert colour in ctx[cell][1][1]
def test_bar_color_raises(df):
    """Invalid ``color`` arguments to ``Styler.bar`` raise ``ValueError``."""
    msg = "`color` must be string or list or tuple of 2 strings"
    # a set, then a list that is too long
    for bad_color in ({"a", "b"}, ["a", "b", "c"]):
        with pytest.raises(ValueError, match=msg):
            df.style.bar(color=bad_color).to_html()

    with pytest.raises(ValueError, match="`color` and `cmap` cannot both be given"):
        df.style.bar(color="something", cmap="something else").to_html()
@pytest.mark.parametrize(
    "plot_method",
    ["scatter", "hexbin"],
)
def test_pass_colormap_instance(df, plot_method):
    """Plot methods accept a colormap object via ``colormap`` without raising."""
    # https://github.com/pandas-dev/pandas/issues/49374
    two_tone = mpl.colors.ListedColormap([[1, 1, 1], [0, 0, 0]])
    df["c"] = df.A + df.B
    plot_kwargs = {"x": "A", "y": "B", "c": "c", "colormap": two_tone}
    if plot_method == "hexbin":
        # for hexbin the colour column is passed under the key "C" instead of "c"
        plot_kwargs["C"] = plot_kwargs.pop("c")
    plotter = getattr(df.plot, plot_method)
    plotter(**plot_kwargs)

View File

@@ -0,0 +1,140 @@
from textwrap import dedent
import pytest
from pandas import (
DataFrame,
IndexSlice,
)
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
    """Float 3x3 frame whose index ("j") and columns ("d") contain duplicates."""
    values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    row_labels = ["i", "j", "j"]
    column_labels = ["c", "d", "d"]
    return DataFrame(values, index=row_labels, columns=column_labels, dtype=float)
@pytest.fixture
def styler(df):
    """Styler built from the ``df`` fixture with ``uuid_len=0``."""
    wrapped = Styler(df, uuid_len=0)
    return wrapped
def test_format_non_unique(df):
    """Formatting by label or subset applies to every duplicated label (GH 41269)."""
    # test dict
    html = df.style.format({"d": "{:.1f}"}).to_html()
    untouched = ["1.000000<", "4.000000<", "7.000000<"]
    reformatted = ["2.0<", "3.0<", "5.0<", "6.0<", "8.0<", "9.0<"]
    for fragment in untouched + reformatted:
        assert fragment in html

    # test subset
    html = df.style.format(precision=1, subset=IndexSlice["j", "d"]).to_html()
    untouched = ["1.000000<", "4.000000<", "7.000000<", "2.000000<", "3.000000<"]
    reformatted = ["5.0<", "6.0<", "8.0<", "9.0<"]
    for fragment in untouched + reformatted:
        assert fragment in html
@pytest.mark.parametrize("func", ["apply", "map"])
def test_apply_map_non_unique_raises(df, func):
    """``apply`` and ``map`` raise ``KeyError`` on non-unique labels (GH 41269)."""
    # ``apply`` is given a callable returning one style per element of its
    # input; ``map`` is given a callable returning a single style string.
    style_funcs = {
        "apply": lambda s: ["color: red;"] * len(s),
        "map": lambda v: "color: red;",
    }
    op = style_funcs[func]

    with pytest.raises(KeyError, match="`Styler.apply` and `.map` are not"):
        getattr(df.style, func)(op)._compute()
def test_table_styles_dict_non_unique_index(styler):
    """A dict keyed by the duplicated row label "j" yields a style per matching row."""
    per_label = {"j": [{"selector": "td", "props": "a: v;"}]}
    styles = styler.set_table_styles(per_label, axis=1).table_styles
    # "j" labels rows 1 and 2
    assert styles == [
        {"selector": f"td.row{pos}", "props": [("a", "v")]} for pos in (1, 2)
    ]
def test_table_styles_dict_non_unique_columns(styler):
    """A dict keyed by the duplicated column "d" yields a style per matching column."""
    per_label = {"d": [{"selector": "td", "props": "a: v;"}]}
    styles = styler.set_table_styles(per_label, axis=0).table_styles
    # "d" labels columns 1 and 2
    assert styles == [
        {"selector": f"td.col{pos}", "props": [("a", "v")]} for pos in (1, 2)
    ]
def test_tooltips_non_unique_raises(styler):
    """``set_tooltips`` rejects a ``ttips`` frame with duplicated labels."""
    cells = [["1", "2"], ["3", "4"]]

    # ttips has unique keys
    styler.set_tooltips(ttips=DataFrame(cells, columns=["c", "d"], index=["a", "b"]))  # OK

    duplicated_labels = [
        {"columns": ["c", "c"], "index": ["a", "b"]},  # non-unique columns
        {"columns": ["c", "d"], "index": ["a", "a"]},  # non-unique index
    ]
    for labels in duplicated_labels:
        ttips = DataFrame(cells, **labels)
        with pytest.raises(KeyError, match="Tooltips render only if `ttips` has unique"):
            styler.set_tooltips(ttips=ttips)
def test_set_td_classes_non_unique_raises(styler):
    """``set_td_classes`` rejects a ``classes`` frame with duplicated labels."""
    cells = [["1", "2"], ["3", "4"]]

    # classes has unique keys
    styler.set_td_classes(
        classes=DataFrame(cells, columns=["c", "d"], index=["a", "b"])
    )  # OK

    duplicated_labels = [
        {"columns": ["c", "c"], "index": ["a", "b"]},  # non-unique columns
        {"columns": ["c", "d"], "index": ["a", "a"]},  # non-unique index
    ]
    for labels in duplicated_labels:
        classes = DataFrame(cells, **labels)
        with pytest.raises(KeyError, match="Classes render only if `classes` has unique"):
            styler.set_td_classes(classes=classes)
def test_hide_columns_non_unique(styler):
    """Hiding the duplicated label "d" hides both matching columns."""
    ctx = styler.hide(["d"], axis="columns")._translate(True, True)
    header_row, first_body_row = ctx["head"][0], ctx["body"][0]

    # position 0 is not checked here; columns start at position 1
    column_state = [("c", True), ("d", False), ("d", False)]
    for pos, (label, visible) in enumerate(column_state, start=1):
        assert header_row[pos]["display_value"] == label
        assert header_row[pos]["is_visible"] is visible

    for pos, (_, visible) in enumerate(column_state, start=1):
        assert first_body_row[pos]["is_visible"] is visible
def test_latex_non_unique(styler):
    """``to_latex`` renders every row and column, including duplicated labels.

    NOTE(review): whitespace inside the expected literal is significant —
    confirm it matches the renderer's exact output.
    """
    result = styler.to_latex()
    # ``dedent`` strips the common leading whitespace of the literal's lines.
    assert result == dedent(
        """\
\\begin{tabular}{lrrr}
& c & d & d \\\\
i & 1.000000 & 2.000000 & 3.000000 \\\\
j & 4.000000 & 5.000000 & 6.000000 \\\\
j & 7.000000 & 8.000000 & 9.000000 \\\\
\\end{tabular}
"""
    )

View File

@@ -0,0 +1,91 @@
from textwrap import dedent
import pytest
from pandas import DataFrame
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
    """Two-row frame with an int, a float and a string column."""
    columns = {"A": [0, 1], "B": [-0.61, -1.22], "C": ["ab", "cd"]}
    return DataFrame(columns)
@pytest.fixture
def styler(df):
    """Styler built from the ``df`` fixture with ``uuid_len=0`` and ``precision=2``."""
    wrapped = Styler(df, uuid_len=0, precision=2)
    return wrapped
def test_basic_string(styler):
    """``to_string`` with default arguments renders a header row and both data rows.

    NOTE(review): whitespace inside the expected literal is significant —
    confirm it matches the renderer's exact output.
    """
    result = styler.to_string()
    # ``dedent`` strips the common leading whitespace of the literal's lines.
    expected = dedent(
        """\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
"""
    )
    assert result == expected
def test_string_delimiter(styler):
    """``to_string(delimiter=";")`` separates every field with a semicolon."""
    result = styler.to_string(delimiter=";")
    # The header line starts with the delimiter: there is no text before the
    # first column name.
    expected = dedent(
        """\
;A;B;C
0;0;-0.61;ab
1;1;-1.22;cd
"""
    )
    assert result == expected
def test_concat(styler):
    """Concatenating a Styler of the column sums appends a "sum" row to the output.

    NOTE(review): whitespace inside the expected literal is significant —
    confirm it matches the renderer's exact output.
    """
    # The appended Styler comes from ``.style`` on the aggregated frame; its
    # float is expected with six decimals while the base rows show two.
    result = styler.concat(styler.data.agg(["sum"]).style).to_string()
    expected = dedent(
        """\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
sum 1 -1.830000 abcd
"""
    )
    assert result == expected
def test_concat_recursion(styler):
    """Nested ``concat`` renders each Styler's rows with its own precision.

    NOTE(review): whitespace inside the expected literal is significant —
    confirm it matches the renderer's exact output.
    """
    df = styler.data
    styler1 = styler
    # Two summary Stylers over the same aggregated frame, differing in precision.
    styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3)
    styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4)
    # styler3 is concatenated onto styler2, and that result onto styler1.
    result = styler1.concat(styler2.concat(styler3)).to_string()
    expected = dedent(
        """\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
sum 1 -1.830 abcd
sum 1 -1.8300 abcd
"""
    )
    assert result == expected
def test_concat_chain(styler):
    """Chained ``concat`` calls render each Styler's rows with its own precision.

    NOTE(review): whitespace inside the expected literal is significant —
    confirm it matches the renderer's exact output.
    """
    df = styler.data
    styler1 = styler
    # Two summary Stylers over the same aggregated frame, differing in precision.
    styler2 = Styler(df.agg(["sum"]), uuid_len=0, precision=3)
    styler3 = Styler(df.agg(["sum"]), uuid_len=0, precision=4)
    # Both summaries are concatenated in sequence onto styler1.
    result = styler1.concat(styler2).concat(styler3).to_string()
    expected = dedent(
        """\
A B C
0 0 -0.61 ab
1 1 -1.22 cd
sum 1 -1.830 abcd
sum 1 -1.8300 abcd
"""
    )
    assert result == expected

View File

@@ -0,0 +1,85 @@
import numpy as np
import pytest
from pandas import DataFrame
pytest.importorskip("jinja2")
from pandas.io.formats.style import Styler
@pytest.fixture
def df():
    """Integer 3x3 frame with rows x/y/z and columns A/B/C."""
    grid = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    return DataFrame(data=grid, columns=["A", "B", "C"], index=["x", "y", "z"])
@pytest.fixture
def styler(df):
    """Styler built from the ``df`` fixture with ``uuid_len=0``."""
    wrapped = Styler(df, uuid_len=0)
    return wrapped
@pytest.mark.parametrize(
    "ttips",
    [
        DataFrame(  # Test basic reindex and ignoring blank
            data=[["Min", "Max"], [np.nan, ""]],
            columns=["A", "C"],
            index=["x", "y"],
        ),
        DataFrame(  # Test non-referenced columns, reversed col names, short index
            data=[["Max", "Min", "Bad-Col"]], columns=["C", "A", "D"], index=["x"]
        ),
    ],
)
def test_tooltip_render(ttips, styler):
    """Tooltips render only for non-blank entries whose labels exist in the data.

    Both parametrized frames place "Min" at (x, A) and "Max" at (x, C). NaN,
    empty-string and "D"-column entries must leave no trace in the HTML.

    NOTE(review): whitespace inside the expected CSS fragments is significant —
    confirm it matches the renderer's exact output.
    """
    # GH 21266
    result = styler.set_tooltips(ttips).to_html()
    # test tooltip table level class
    assert "#T_ .pd-t {\n visibility: hidden;\n" in result
    # test 'Min' tooltip added
    assert "#T_ #T__row0_col0:hover .pd-t {\n visibility: visible;\n}" in result
    assert '#T_ #T__row0_col0 .pd-t::after {\n content: "Min";\n}' in result
    assert 'class="data row0 col0" >0<span class="pd-t"></span></td>' in result
    # test 'Max' tooltip added
    assert "#T_ #T__row0_col2:hover .pd-t {\n visibility: visible;\n}" in result
    assert '#T_ #T__row0_col2 .pd-t::after {\n content: "Max";\n}' in result
    assert 'class="data row0 col2" >2<span class="pd-t"></span></td>' in result
    # test NaN, empty string and bad column ignored
    assert "#T_ #T__row1_col0:hover .pd-t {\n visibility: visible;\n}" not in result
    assert "#T_ #T__row1_col1:hover .pd-t {\n visibility: visible;\n}" not in result
    assert "#T_ #T__row0_col1:hover .pd-t {\n visibility: visible;\n}" not in result
    assert "#T_ #T__row1_col2:hover .pd-t {\n visibility: visible;\n}" not in result
    assert "Bad-Col" not in result
def test_tooltip_ignored(styler):
    """Without ``set_tooltips`` the HTML has an empty style block and no tooltip span."""
    # GH 21266
    html = styler.to_html()  # no set_tooltips() creates no <span>
    empty_style_block = '<style type="text/css">\n</style>'
    tooltip_span = '<span class="pd-t"></span>'
    assert empty_style_block in html
    assert tooltip_span not in html
def test_tooltip_css_class(styler):
    """``css_class`` and ``props`` given to ``set_tooltips`` appear in the rendered CSS.

    ``props`` is exercised first as a list of pairs and then as a CSS string.

    NOTE(review): whitespace inside the expected CSS fragments is significant —
    confirm it matches the renderer's exact output.
    """
    # GH 21266
    result = styler.set_tooltips(
        DataFrame([["tooltip"]], index=["x"], columns=["A"]),
        css_class="other-class",
        props=[("color", "green")],
    ).to_html()
    assert "#T_ .other-class {\n color: green;\n" in result
    assert '#T_ #T__row0_col0 .other-class::after {\n content: "tooltip";\n' in result
    # GH 39563
    result = styler.set_tooltips(  # set_tooltips overwrites previous
        DataFrame([["tooltip"]], index=["x"], columns=["A"]),
        css_class="another-class",
        props="color:green;color:red;",
    ).to_html()
    # both declarations from the string are expected, in the order given
    assert "#T_ .another-class {\n color: green;\n color: red;\n}" in result

View File

@@ -0,0 +1,72 @@
import locale
import pytest
from pandas._config import detect_console_encoding
class MockEncoding:
    """
    Stand-in stream whose 'encoding' property has a configurable side effect.

    If the configured value is a str it is returned as the encoding. Any
    other value is treated as an exception and raised on access.
    """

    def __init__(self, encoding) -> None:
        super().__init__()
        self.val = encoding

    @property
    def encoding(self):
        """Return the configured value, or raise it when it is not a str."""
        return self.raise_or_return(self.val)

    @staticmethod
    def raise_or_return(val):
        """Return ``val`` when it is a str; otherwise raise it."""
        if not isinstance(val, str):
            raise val
        return val
@pytest.mark.parametrize("empty,filled", [["stdin", "stdout"], ["stdout", "stdin"]])
def test_detect_console_encoding_from_stdout_stdin(monkeypatch, empty, filled):
    """The stream that reports a non-empty encoding determines the result (GH 21552)."""
    # One of sys.stdout / sys.stdin reports an empty encoding and the other
    # reports its own name; the detected encoding must be the filled one.
    with monkeypatch.context() as patcher:
        blank_stream = MockEncoding("")
        named_stream = MockEncoding(filled)
        patcher.setattr(f"sys.{empty}", blank_stream)
        patcher.setattr(f"sys.{filled}", named_stream)
        assert detect_console_encoding() == filled
@pytest.mark.parametrize("encoding", [AttributeError, OSError, "ascii"])
def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding):
    """A failing or 'ascii' stdout encoding falls back to the locale's (GH 21552)."""

    def preferred_encoding():
        return "foo"

    with monkeypatch.context() as patcher:
        patcher.setattr("locale.getpreferredencoding", preferred_encoding)
        patcher.setattr("sys.stdout", MockEncoding(encoding))
        assert detect_console_encoding() == "foo"
@pytest.mark.parametrize(
    "std,locale",
    [
        ["ascii", "ascii"],
        ["ascii", locale.Error],
        [AttributeError, "ascii"],
        [AttributeError, locale.Error],
        [OSError, "ascii"],
        [OSError, locale.Error],
    ],
)
def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale):
    """Falls back to ``sys.getdefaultencoding`` when stream and locale fail (GH 21552)."""
    # When both the stdout/stdin encoding and the locale preferred encoding
    # checks fail (or return 'ascii'), the sys default encoding is used.

    def preferred_encoding():
        # here ``locale`` is the parametrized value, not the module
        return MockEncoding.raise_or_return(locale)

    def default_encoding():
        return "sysDefaultEncoding"

    with monkeypatch.context() as patcher:
        patcher.setattr("locale.getpreferredencoding", preferred_encoding)
        patcher.setattr("sys.stdout", MockEncoding(std))
        patcher.setattr("sys.getdefaultencoding", default_encoding)
        assert detect_console_encoding() == "sysDefaultEncoding"

View File

@@ -0,0 +1,289 @@
import pytest
from pandas.errors import CSSWarning
import pandas._testing as tm
from pandas.io.formats.css import CSSResolver
def assert_resolves(css, props, inherited=None):
    """Assert that a fresh ``CSSResolver`` resolves ``css`` to exactly ``props``.

    ``inherited`` is forwarded to the resolver call unchanged.
    """
    resolved = CSSResolver()(css, inherited=inherited)
    assert props == resolved
def assert_same_resolution(css1, css2, inherited=None):
    """Assert that ``css1`` and ``css2`` resolve to equal property dicts.

    Both are resolved by one ``CSSResolver`` instance with the same ``inherited``.
    """
    resolver = CSSResolver()
    first, second = (resolver(css, inherited=inherited) for css in (css1, css2))
    assert first == second
@pytest.mark.parametrize(
    "name,norm,abnorm",
    [
        (
            "whitespace",
            "hello: world; foo: bar",
            " \t hello \t :\n world \n ; \n foo: \tbar\n\n",
        ),
        ("case", "hello: world; foo: bar", "Hello: WORLD; foO: bar"),
        ("empty-decl", "hello: world; foo: bar", "; hello: world;; foo: bar;\n; ;"),
        ("empty-list", "", ";"),
    ],
)
def test_css_parse_normalisation(name, norm, abnorm):
    """Irregular spacing, casing and empty declarations resolve like the normal form.

    ``name`` only labels the case; it is not used in the body.
    """
    normal_form, irregular_form = norm, abnorm
    assert_same_resolution(normal_form, irregular_form)
@pytest.mark.parametrize(
    "invalid_css,remainder",
    [
        # No colon
        ("hello-world", ""),
        ("border-style: solid; hello-world", "border-style: solid"),
        (
            "border-style: solid; hello-world; font-weight: bold",
            "border-style: solid; font-weight: bold",
        ),
        # Unclosed string fail (no cases listed)
        # Invalid size
        ("font-size: blah", "font-size: 1em"),
        ("font-size: 1a2b", "font-size: 1em"),
        ("font-size: 1e5pt", "font-size: 1em"),
        ("font-size: 1+6pt", "font-size: 1em"),
        ("font-size: 1unknownunit", "font-size: 1em"),
        ("font-size: 10", "font-size: 1em"),
        ("font-size: 10 pt", "font-size: 1em"),
        # Too many args
        ("border-top: 1pt solid red green", "border-top: 1pt solid green"),
    ],
)
def test_css_parse_invalid(invalid_css, remainder):
    """Invalid CSS warns with ``CSSWarning`` and resolves like ``remainder``."""
    expect_warning = tm.assert_produces_warning(CSSWarning)
    with expect_warning:
        assert_same_resolution(invalid_css, remainder)
@pytest.mark.parametrize(
    "shorthand,expansions",
    [
        ("margin", ["margin-top", "margin-right", "margin-bottom", "margin-left"]),
        ("padding", ["padding-top", "padding-right", "padding-bottom", "padding-left"]),
        (
            "border-width",
            [
                "border-top-width",
                "border-right-width",
                "border-bottom-width",
                "border-left-width",
            ],
        ),
        (
            "border-color",
            [
                "border-top-color",
                "border-right-color",
                "border-bottom-color",
                "border-left-color",
            ],
        ),
        (
            "border-style",
            [
                "border-top-style",
                "border-right-style",
                "border-bottom-style",
                "border-left-style",
            ],
        ),
    ],
)
def test_css_side_shorthands(shorthand, expansions):
    """One to four values expand onto the four sides; five values warn."""
    # (shorthand value, expected values in top/right/bottom/left order)
    cases = [
        ("1pt", ("1pt", "1pt", "1pt", "1pt")),
        ("1pt 4pt", ("1pt", "4pt", "1pt", "4pt")),
        ("1pt 4pt 2pt", ("1pt", "4pt", "2pt", "4pt")),
        ("1pt 4pt 2pt 0pt", ("1pt", "4pt", "2pt", "0pt")),
    ]
    for value, per_side in cases:
        assert_resolves(f"{shorthand}: {value}", dict(zip(expansions, per_side)))

    # five values resolve to nothing and emit a warning
    with tm.assert_produces_warning(CSSWarning):
        assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {})
@pytest.mark.parametrize(
    "shorthand,sides",
    [
        ("border-top", ["top"]),
        ("border-right", ["right"]),
        ("border-bottom", ["bottom"]),
        ("border-left", ["left"]),
        ("border", ["top", "right", "bottom", "left"]),
    ],
)
def test_css_border_shorthand_sides(shorthand, sides):
    """A border shorthand sets color, style and width on each affected side."""
    parts = (("color", "red"), ("style", "solid"), ("width", "1pt"))
    expected = {
        f"border-{side}-{part}": value for side in sides for part, value in parts
    }
    assert_resolves(f"{shorthand}: 1pt red solid", expected)
@pytest.mark.parametrize(
    "prop, expected",
    [
        ("1pt red solid", ("red", "solid", "1pt")),
        ("red 1pt solid", ("red", "solid", "1pt")),
        ("red solid 1pt", ("red", "solid", "1pt")),
        ("solid 1pt red", ("red", "solid", "1pt")),
        ("red solid", ("red", "solid", "1.500000pt")),
        # Note: color=black is not CSS conforming
        # (See https://drafts.csswg.org/css-backgrounds/#border-shorthands)
        ("1pt solid", ("black", "solid", "1pt")),
        ("1pt red", ("red", "none", "1pt")),
        ("red", ("red", "none", "1.500000pt")),
        ("1pt", ("black", "none", "1pt")),
        ("solid", ("black", "solid", "1.500000pt")),
        # Sizes
        ("1em", ("black", "none", "12pt")),
    ],
)
def test_css_border_shorthands(prop, expected):
    """``border-left`` tokens resolve in any order; missing parts take defaults."""
    # ``expected`` is ordered (color, style, width)
    longhands = ("border-left-color", "border-left-style", "border-left-width")
    assert_resolves(f"border-left: {prop}", dict(zip(longhands, expected)))
@pytest.mark.parametrize(
    "style,inherited,equiv",
    [
        ("margin: 1px; margin: 2px", "", "margin: 2px"),
        ("margin: 1px", "margin: 2px", "margin: 1px"),
        ("margin: 1px; margin: inherit", "margin: 2px", "margin: 2px"),
        (
            "margin: 1px; margin-top: 2px",
            "",
            "margin-left: 1px; margin-right: 1px; "
            "margin-bottom: 1px; margin-top: 2px",
        ),
        ("margin-top: 2px", "margin: 1px", "margin: 1px; margin-top: 2px"),
        ("margin: 1px", "margin-top: 2px", "margin: 1px"),
        (
            "margin: 1px; margin-top: inherit",
            "margin: 2px",
            "margin: 1px; margin-top: 2px",
        ),
    ],
)
def test_css_precedence(style, inherited, equiv):
    """``style`` resolved on top of ``inherited`` equals ``equiv`` resolved alone."""
    resolver = CSSResolver()
    # ``inherited`` is itself CSS text, so it is resolved before being passed on
    parent_props = resolver(inherited)
    assert resolver(style, inherited=parent_props) == resolver(equiv)
@pytest.mark.parametrize(
    "style,equiv",
    [
        (
            "margin: 1px; margin-top: inherit",
            "margin-bottom: 1px; margin-right: 1px; margin-left: 1px",
        ),
        ("margin-top: inherit", ""),
        ("margin-top: initial", ""),
    ],
)
def test_css_none_absent(style, equiv):
    """``inherit``/``initial`` with nothing inherited resolve as if the property were absent."""
    with_keyword, without_property = style, equiv
    assert_same_resolution(with_keyword, without_property)
@pytest.mark.parametrize(
    "size,resolved",
    [
        ("xx-small", "6pt"),
        ("x-small", f"{7.5:f}pt"),
        ("small", f"{9.6:f}pt"),
        ("medium", "12pt"),
        ("large", f"{13.5:f}pt"),
        ("x-large", "18pt"),
        ("xx-large", "24pt"),
        ("8px", "6pt"),
        ("1.25pc", "15pt"),
        (".25in", "18pt"),
        ("02.54cm", "72pt"),
        ("25.4mm", "72pt"),
        # the "101.6q" case was listed twice; one copy is enough
        ("101.6q", "72pt"),
    ],
)
@pytest.mark.parametrize("relative_to", [None, "16pt"])  # invariant to inherited size
def test_css_absolute_font_size(size, relative_to, resolved):
    """Absolute font sizes resolve to the same pt value with or without an inherited size."""
    inherited = None if relative_to is None else {"font-size": relative_to}
    assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited)
@pytest.mark.parametrize(
    "size,relative_to,resolved",
    [
        ("1em", None, "12pt"),
        ("1.0em", None, "12pt"),
        ("1.25em", None, "15pt"),
        ("1em", "16pt", "16pt"),
        ("1.0em", "16pt", "16pt"),
        ("1.25em", "16pt", "20pt"),
        ("1rem", "16pt", "12pt"),
        ("1.0rem", "16pt", "12pt"),
        ("1.25rem", "16pt", "15pt"),
        ("100%", None, "12pt"),
        ("125%", None, "15pt"),
        ("100%", "16pt", "16pt"),
        ("125%", "16pt", "20pt"),
        ("2ex", None, "12pt"),
        ("2.0ex", None, "12pt"),
        ("2.50ex", None, "15pt"),
        ("inherit", "16pt", "16pt"),
        ("smaller", None, "10pt"),
        ("smaller", "18pt", "15pt"),
        ("larger", None, f"{14.4:f}pt"),
        ("larger", "15pt", "18pt"),
    ],
)
def test_css_relative_font_size(size, relative_to, resolved):
    """Relative font sizes resolve against the inherited size when one is given."""
    # ``None`` means no inherited properties are passed at all
    inherited = None if relative_to is None else {"font-size": relative_to}
    declaration = f"font-size: {size}"
    assert_resolves(declaration, {"font-size": resolved}, inherited=inherited)

View File

@@ -0,0 +1,234 @@
import numpy as np
from pandas import DataFrame
import pandas._testing as tm
import pandas.io.formats.format as fmt
class TestEngFormatter:
    """Tests for engineering-notation float formatting (``fmt.EngFormatter``)."""

    def test_eng_float_formatter(self):
        """``set_eng_float_format`` controls how ``DataFrame.to_string`` renders floats."""
        df = DataFrame({"A": [1.41, 141.0, 14100, 1410000.0]})

        # set_eng_float_format mutates a global display option, so restore it
        # even when one of the assertions below fails.
        try:
            fmt.set_eng_float_format()
            result = df.to_string()
            # Values are right-aligned to the widest cell (12 characters).
            expected = (
                "             A\n"
                "0    1.410E+00\n"
                "1  141.000E+00\n"
                "2   14.100E+03\n"
                "3    1.410E+06"
            )
            assert result == expected

            fmt.set_eng_float_format(use_eng_prefix=True)
            result = df.to_string()
            expected = (
                "         A\n"
                "0    1.410\n"
                "1  141.000\n"
                "2  14.100k\n"
                "3   1.410M"
            )
            assert result == expected

            fmt.set_eng_float_format(accuracy=0)
            result = df.to_string()
            expected = (
                "         A\n"
                "0    1E+00\n"
                "1  141E+00\n"
                "2   14E+03\n"
                "3    1E+06"
            )
            assert result == expected
        finally:
            tm.reset_display_options()

    def compare(self, formatter, input, output):
        """Assert that ``formatter(input)`` equals ``output``."""
        formatted_input = formatter(input)
        assert formatted_input == output

    def compare_all(self, formatter, in_out):
        """
        Parameters:
        -----------
        formatter: EngFormatter under test
        in_out: list of tuples. Each tuple = (number, expected_formatting)

        It is tested if 'formatter(number) == expected_formatting'.
        *number* should be >= 0 because formatter(-number) == fmt is also
        tested. *fmt* is derived from *expected_formatting*
        """
        for input, output in in_out:
            self.compare(formatter, input, output)
            # The leading sign placeholder (a space) becomes "-" for negatives.
            self.compare(formatter, -input, "-" + output[1:])

    def test_exponents_with_eng_prefix(self):
        """Every SI prefix from yocto to yotta is selected for the right magnitude."""
        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        f = np.sqrt(2)
        in_out = [
            (f * 10**-24, " 1.414y"),
            (f * 10**-23, " 14.142y"),
            (f * 10**-22, " 141.421y"),
            (f * 10**-21, " 1.414z"),
            (f * 10**-20, " 14.142z"),
            (f * 10**-19, " 141.421z"),
            (f * 10**-18, " 1.414a"),
            (f * 10**-17, " 14.142a"),
            (f * 10**-16, " 141.421a"),
            (f * 10**-15, " 1.414f"),
            (f * 10**-14, " 14.142f"),
            (f * 10**-13, " 141.421f"),
            (f * 10**-12, " 1.414p"),
            (f * 10**-11, " 14.142p"),
            (f * 10**-10, " 141.421p"),
            (f * 10**-9, " 1.414n"),
            (f * 10**-8, " 14.142n"),
            (f * 10**-7, " 141.421n"),
            (f * 10**-6, " 1.414u"),
            (f * 10**-5, " 14.142u"),
            (f * 10**-4, " 141.421u"),
            (f * 10**-3, " 1.414m"),
            (f * 10**-2, " 14.142m"),
            (f * 10**-1, " 141.421m"),
            (f * 10**0, " 1.414"),
            (f * 10**1, " 14.142"),
            (f * 10**2, " 141.421"),
            (f * 10**3, " 1.414k"),
            (f * 10**4, " 14.142k"),
            (f * 10**5, " 141.421k"),
            (f * 10**6, " 1.414M"),
            (f * 10**7, " 14.142M"),
            (f * 10**8, " 141.421M"),
            (f * 10**9, " 1.414G"),
            (f * 10**10, " 14.142G"),
            (f * 10**11, " 141.421G"),
            (f * 10**12, " 1.414T"),
            (f * 10**13, " 14.142T"),
            (f * 10**14, " 141.421T"),
            (f * 10**15, " 1.414P"),
            (f * 10**16, " 14.142P"),
            (f * 10**17, " 141.421P"),
            (f * 10**18, " 1.414E"),
            (f * 10**19, " 14.142E"),
            (f * 10**20, " 141.421E"),
            (f * 10**21, " 1.414Z"),
            (f * 10**22, " 14.142Z"),
            (f * 10**23, " 141.421Z"),
            (f * 10**24, " 1.414Y"),
            (f * 10**25, " 14.142Y"),
            (f * 10**26, " 141.421Y"),
        ]
        self.compare_all(formatter, in_out)

    def test_exponents_without_eng_prefix(self):
        """Without prefixes, exponents are rendered as multiples of three."""
        formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False)
        f = np.pi
        in_out = [
            (f * 10**-24, " 3.1416E-24"),
            (f * 10**-23, " 31.4159E-24"),
            (f * 10**-22, " 314.1593E-24"),
            (f * 10**-21, " 3.1416E-21"),
            (f * 10**-20, " 31.4159E-21"),
            (f * 10**-19, " 314.1593E-21"),
            (f * 10**-18, " 3.1416E-18"),
            (f * 10**-17, " 31.4159E-18"),
            (f * 10**-16, " 314.1593E-18"),
            (f * 10**-15, " 3.1416E-15"),
            (f * 10**-14, " 31.4159E-15"),
            (f * 10**-13, " 314.1593E-15"),
            (f * 10**-12, " 3.1416E-12"),
            (f * 10**-11, " 31.4159E-12"),
            (f * 10**-10, " 314.1593E-12"),
            (f * 10**-9, " 3.1416E-09"),
            (f * 10**-8, " 31.4159E-09"),
            (f * 10**-7, " 314.1593E-09"),
            (f * 10**-6, " 3.1416E-06"),
            (f * 10**-5, " 31.4159E-06"),
            (f * 10**-4, " 314.1593E-06"),
            (f * 10**-3, " 3.1416E-03"),
            (f * 10**-2, " 31.4159E-03"),
            (f * 10**-1, " 314.1593E-03"),
            (f * 10**0, " 3.1416E+00"),
            (f * 10**1, " 31.4159E+00"),
            (f * 10**2, " 314.1593E+00"),
            (f * 10**3, " 3.1416E+03"),
            (f * 10**4, " 31.4159E+03"),
            (f * 10**5, " 314.1593E+03"),
            (f * 10**6, " 3.1416E+06"),
            (f * 10**7, " 31.4159E+06"),
            (f * 10**8, " 314.1593E+06"),
            (f * 10**9, " 3.1416E+09"),
            (f * 10**10, " 31.4159E+09"),
            (f * 10**11, " 314.1593E+09"),
            (f * 10**12, " 3.1416E+12"),
            (f * 10**13, " 31.4159E+12"),
            (f * 10**14, " 314.1593E+12"),
            (f * 10**15, " 3.1416E+15"),
            (f * 10**16, " 31.4159E+15"),
            (f * 10**17, " 314.1593E+15"),
            (f * 10**18, " 3.1416E+18"),
            (f * 10**19, " 31.4159E+18"),
            (f * 10**20, " 314.1593E+18"),
            (f * 10**21, " 3.1416E+21"),
            (f * 10**22, " 31.4159E+21"),
            (f * 10**23, " 314.1593E+21"),
            (f * 10**24, " 3.1416E+24"),
            (f * 10**25, " 31.4159E+24"),
            (f * 10**26, " 314.1593E+24"),
        ]
        self.compare_all(formatter, in_out)

    def test_rounding(self):
        """The mantissa is rounded to the requested ``accuracy``."""
        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        in_out = [
            (5.55555, " 5.556"),
            (55.5555, " 55.556"),
            (555.555, " 555.555"),
            (5555.55, " 5.556k"),
            (55555.5, " 55.556k"),
            (555555, " 555.555k"),
        ]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        in_out = [
            (5.55555, " 5.6"),
            (55.5555, " 55.6"),
            (555.555, " 555.6"),
            (5555.55, " 5.6k"),
            (55555.5, " 55.6k"),
            (555555, " 555.6k"),
        ]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True)
        in_out = [
            (5.55555, " 6"),
            (55.5555, " 56"),
            (555.555, " 556"),
            (5555.55, " 6k"),
            (55555.5, " 56k"),
            (555555, " 556k"),
        ]
        self.compare_all(formatter, in_out)

        formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True)
        result = formatter(0)
        assert result == " 0.000"

    def test_nan(self):
        """NaN is rendered as ``NaN`` both directly and inside a frame."""
        # Issue #11981
        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        result = formatter(np.nan)
        assert result == "NaN"

        df = DataFrame(
            {
                "a": [1.5, 10.3, 20.5],
                "b": [50.3, 60.67, 70.12],
                "c": [100.2, 101.33, 120.33],
            }
        )
        pt = df.pivot_table(values="a", index="b", columns="c")
        # Global option: restore it even if the assertion fails.
        try:
            fmt.set_eng_float_format(accuracy=1)
            result = pt.to_string()
            assert "NaN" in result
        finally:
            tm.reset_display_options()

    def test_inf(self):
        """Infinity is rendered as ``inf``."""
        # Issue #11981
        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
        result = formatter(np.inf)
        assert result == "inf"

View File

@@ -0,0 +1,527 @@
from io import StringIO
import re
from string import ascii_uppercase as uppercase
import sys
import textwrap
import numpy as np
import pytest
from pandas.compat import (
IS64,
PYPY,
)
from pandas import (
CategoricalIndex,
DataFrame,
MultiIndex,
Series,
date_range,
option_context,
)
import pandas._testing as tm
@pytest.fixture
def duplicate_columns_frame():
    """Dataframe with duplicate column names."""
    # Fixed seed keeps the random payload reproducible between runs.
    values = np.random.default_rng(2).standard_normal((1500, 4))
    labels = ["a", "a", "b", "b"]
    return DataFrame(values, columns=labels)
def test_info_empty():
    """``DataFrame.info`` on an empty frame prints a three-line summary."""
    # GH #45494
    out = StringIO()
    DataFrame().info(buf=out)
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 0 entries
        Empty DataFrame\n"""
    )
    assert out.getvalue() == expected
def test_info_categorical_column_smoke_test():
    """``info`` runs on a frame with a categorical column and on a filtered view of it."""
    n = 2500
    ints = np.random.default_rng(2).integers(100, size=n, dtype=int)
    df = DataFrame({"int64": ints})

    letters = np.array(list("abcdefghij"))
    picks = np.random.default_rng(2).integers(0, 10, size=n, dtype=int)
    df["category"] = Series(letters.take(picks)).astype("category")

    df.isna()
    df.info(buf=StringIO())

    # Same smoke check on a boolean-mask selection.
    subset = df[df["category"] == "d"]
    subset.info(buf=StringIO())
@pytest.mark.parametrize(
    "fixture_func_name",
    [
        "int_frame",
        "float_frame",
        "datetime_frame",
        "duplicate_columns_frame",
    ],
)
def test_info_smoke_test(fixture_func_name, request):
    """``info`` output for each named frame fixture has more than ten lines."""
    # The frame is looked up by fixture name at run time.
    frame = request.getfixturevalue(fixture_func_name)
    out = StringIO()
    frame.info(buf=out)
    line_total = len(out.getvalue().splitlines())
    assert line_total > 10
@pytest.mark.parametrize(
    "num_columns, max_info_columns, verbose",
    [
        (10, 100, True),
        (10, 11, True),
        (10, 10, True),
        (10, 9, False),
        (10, 1, False),
    ],
)
def test_info_default_verbose_selection(num_columns, max_info_columns, verbose):
    """The default verbosity follows ``display.max_info_columns``.

    With the option set, plain ``info()`` must print what ``info(verbose=...)``
    prints for the parametrized ``verbose`` value.
    """
    frame = DataFrame(np.random.default_rng(2).standard_normal((5, num_columns)))

    def render(**info_kwargs):
        out = StringIO()
        frame.info(buf=out, **info_kwargs)
        return out.getvalue()

    with option_context("display.max_info_columns", max_info_columns):
        result = render()
        expected = render(verbose=verbose)
        assert result == expected
def test_info_verbose_check_header_separator_body():
    """Verbose ``info`` prints a header, a separator and one numbered row per column."""
    buf = StringIO()
    size = 1001
    start = 5  # body rows begin after class, index, column-count, header, separator
    frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
    frame.info(verbose=True, buf=buf)

    res = buf.getvalue()
    # Cells are padded to the widest entry of their column (" 1000" is 5
    # wide) and joined with two spaces.
    header = " #     Column  Dtype  \n---    ------  -----  "
    assert header in res

    # The buffer already holds the full output; no second info() call needed.
    buf.seek(0)
    lines = buf.readlines()
    assert len(lines) > 0

    for i, line in enumerate(lines):
        if start <= i < start + size:
            line_nr = f" {i - start} "
            assert line.startswith(line_nr)
@pytest.mark.parametrize(
    "size, header_exp, separator_exp, first_line_exp, last_line_exp",
    [
        (
            4,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 3   3       3 non-null      float64",
        ),
        (
            11,
            " #   Column  Non-Null Count  Dtype  ",
            "---  ------  --------------  -----  ",
            " 0   0       3 non-null      float64",
            " 10  10      3 non-null      float64",
        ),
        (
            101,
            " #    Column  Non-Null Count  Dtype  ",
            "---   ------  --------------  -----  ",
            " 0    0       3 non-null      float64",
            " 100  100     3 non-null      float64",
        ),
        (
            1001,
            " #     Column  Non-Null Count  Dtype  ",
            "---    ------  --------------  -----  ",
            " 0     0       3 non-null      float64",
            " 1000  1000    3 non-null      float64",
        ),
        (
            10001,
            " #      Column  Non-Null Count  Dtype  ",
            "---     ------  --------------  -----  ",
            " 0      0       3 non-null      float64",
            " 10000  10000   3 non-null      float64",
        ),
    ],
)
def test_info_verbose_with_counts_spacing(
    size, header_exp, separator_exp, first_line_exp, last_line_exp
):
    """Test header column, spacer, first line and last line in verbose mode.

    The expected strings keep the exact column padding: each cell is
    left-justified to the widest entry of its column and columns are joined
    with two spaces, so the ``#`` column widens as ``size`` grows.
    """
    frame = DataFrame(np.random.default_rng(2).standard_normal((3, size)))
    with StringIO() as buf:
        frame.info(verbose=True, show_counts=True, buf=buf)
        all_lines = buf.getvalue().splitlines()
    # Here table would contain only header, separator and table lines
    # dframe repr, index summary, memory usage and dtypes are excluded
    table = all_lines[3:-2]
    header, separator, first_line, *rest, last_line = table
    assert header == header_exp
    assert separator == separator_exp
    assert first_line == first_line_exp
    assert last_line == last_line_exp
def test_info_memory():
    """``info`` reports the exact byte count from ``memory_usage`` for an int64 frame."""
    # https://github.com/pandas-dev/pandas/issues/21056
    df = DataFrame({"a": Series([1, 2], dtype="i8")})
    buf = StringIO()
    df.info(buf=buf)
    result = buf.getvalue()
    # Named ``memory_bytes`` so the ``bytes`` builtin is not shadowed.
    memory_bytes = float(df.memory_usage().sum())
    # The table rows keep their column padding; dedent strips only the
    # common 8-space source indentation.
    expected = textwrap.dedent(
        f"""\
        <class 'pandas.core.frame.DataFrame'>
        RangeIndex: 2 entries, 0 to 1
        Data columns (total 1 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   a       2 non-null      int64
        dtypes: int64(1)
        memory usage: {memory_bytes} bytes
        """
    )
    assert result == expected
def test_info_wide():
    """``max_cols=101`` and ``display.max_info_columns=101`` give identical full output."""
    df = DataFrame(np.random.default_rng(2).standard_normal((5, 101)))

    # Default call first: only checks that it runs on a wide frame.
    df.info(buf=StringIO())

    explicit = StringIO()
    df.info(buf=explicit, max_cols=101)
    expected = explicit.getvalue()
    assert len(expected.splitlines()) > 100

    with option_context("display.max_info_columns", 101):
        via_option = StringIO()
        df.info(buf=via_option)
        result = via_option.getvalue()
        assert result == expected
def test_info_duplicate_columns_shows_correct_dtypes():
    """Columns sharing a name are each listed with their own dtype."""
    # GH11761
    io = StringIO()
    frame = DataFrame([[1, 2.0]], columns=["a", "a"])
    frame.info(buf=io)
    lines = io.getvalue().splitlines(True)
    # The Dtype column is padded to its widest entry ("float64"), hence the
    # two trailing spaces after "int64".
    assert " 0   a       1 non-null      int64  \n" == lines[5]
    assert " 1   a       1 non-null      float64\n" == lines[6]
def test_info_shows_column_dtypes():
    """Every column's dtype appears in its ``info`` row."""
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    data = {}
    n = 10
    for i, dtype in enumerate(dtypes):
        data[i] = np.random.default_rng(2).integers(2, size=n).astype(dtype)
    df = DataFrame(data)
    buf = StringIO()
    df.info(buf=buf)
    res = buf.getvalue()
    # The Dtype column is as wide as "timedelta64[ns]" (15 characters), so
    # its header and separator carry ten trailing spaces.
    header = (
        " #   Column  Non-Null Count  Dtype          \n"
        "---  ------  --------------  -----          "
    )
    assert header in res
    for i, dtype in enumerate(dtypes):
        # Spacing mirrors the padded "#", "Column" and "Non-Null Count" cells.
        name = f" {i:d}   {i:d}       {n:d} non-null     {dtype}"
        assert name in res
def test_info_max_cols():
    """Line counts of ``info`` for combinations of ``verbose``, ``max_cols`` and the option.

    The frame has five columns; 12 lines is the full listing and 5 lines the
    summarized form.
    """
    df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))

    def line_count(**info_kwargs):
        out = StringIO()
        df.info(buf=out, **info_kwargs)
        return len(out.getvalue().strip().split("\n"))

    # For verbose always      ^ setting  ^ summarize ^ full output
    for expected_len, verbose in [(5, None), (5, False), (12, True)]:
        with option_context("max_info_columns", 4):
            assert line_count(verbose=verbose) == expected_len

    # max_cols not exceeded
    for expected_len, verbose in [(12, None), (5, False), (12, True)]:
        with option_context("max_info_columns", 5):
            assert line_count(verbose=verbose) == expected_len

    for expected_len, max_cols in [(12, 5), (5, 4)]:
        # setting truncates
        with option_context("max_info_columns", 4):
            assert line_count(max_cols=max_cols) == expected_len

        # setting wouldn't truncate
        with option_context("max_info_columns", 5):
            assert line_count(max_cols=max_cols) == expected_len
def test_info_memory_usage():
    """Memory usage is shown on the last ``info`` line and agrees with ``memory_usage``.

    All ``info`` calls append to one shared buffer, so only the final line of
    the accumulated text is inspected after each call.
    """
    # Ensure memory usage is displayed, when asserted, on the last line
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    df = DataFrame(
        {
            pos: np.random.default_rng(2).integers(2, size=n).astype(dtype)
            for pos, dtype in enumerate(dtypes)
        }
    )
    buf = StringIO()

    def last_info_line(frame, memory_usage):
        frame.info(buf=buf, memory_usage=memory_usage)
        return buf.getvalue().splitlines()[-1]

    lower_bound = r"memory usage: [^+]+\+"

    # display memory usage case
    assert "memory usage: " in last_info_line(df, True)

    # do not display memory usage case
    assert "memory usage: " not in last_info_line(df, False)

    # memory usage is a lower bound, so print it as XYZ+ MB
    assert re.match(lower_bound, last_info_line(df, True))

    # excluded column with object dtype, so estimate is accurate
    assert not re.match(lower_bound, last_info_line(df.iloc[:, :5], True))

    # Test a DataFrame with duplicate columns
    dtypes = ["int64", "int64", "int64", "float64"]
    n = 100
    df = DataFrame(
        {
            pos: np.random.default_rng(2).integers(2, size=n).astype(dtype)
            for pos, dtype in enumerate(dtypes)
        }
    )
    df.columns = dtypes

    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    assert re.match(lower_bound, last_info_line(df_with_object_index, True))
    assert re.match(
        r"memory usage: [^+]+$", last_info_line(df_with_object_index, "deep")
    )

    # Ensure df size is as expected
    # (cols * rows * bytes) + index size
    df_size = df.memory_usage().sum()
    exp_size = len(dtypes) * n * 8 + df.index.nbytes
    assert df_size == exp_size

    # Ensure number of cols in memory_usage is the same as df
    size_df = np.size(df.columns.values) + 1  # index=True; default
    assert size_df == np.size(df.memory_usage())

    # assert deep works only on object
    assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()

    # test for validity: these only need to run without raising
    DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
    DataFrame(1, index=["a"], columns=["A"]).index.nbytes
    df = DataFrame(
        data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
    )
    df.index.nbytes
    df.memory_usage(index=True)
    df.index.values.nbytes

    mem = df.memory_usage(deep=True).sum()
    assert mem > 0
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
    """``deep=True`` reports more memory than the shallow estimate for object data."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    deep_total = df_with_object_index.memory_usage(index=True, deep=True).sum()
    shallow_total = df_with_object_index.memory_usage(index=True).sum()
    assert deep_total > shallow_total

    df_object = DataFrame({"a": ["a"]})
    deep_total = df_object.memory_usage(deep=True).sum()
    shallow_total = df_object.memory_usage().sum()
    assert deep_total > shallow_total
@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
    """Asserts deep and shallow memory usage are equal; marked xfail when not on PyPy."""
    df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
    deep_total = df_with_object_index.memory_usage(index=True, deep=True).sum()
    shallow_total = df_with_object_index.memory_usage(index=True).sum()
    assert deep_total == shallow_total

    df_object = DataFrame({"a": ["a"]})
    deep_total = df_object.memory_usage(deep=True).sum()
    shallow_total = df_object.memory_usage().sum()
    assert deep_total == shallow_total
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
    """``sys.getsizeof`` of a frame stays within 100 bytes of deep ``memory_usage``."""
    index = MultiIndex.from_product([["a"], range(1000)])
    df = DataFrame(data=1, index=index, columns=["A"])
    mem = df.memory_usage(deep=True).sum()
    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    overhead = mem - sys.getsizeof(df)
    assert abs(overhead) < 100
def test_info_memory_usage_qualified():
    """The ``+`` lower-bound marker appears in ``info`` only for the string-labelled indexes."""
    cases = [
        # (index, whether "+" is expected in the output)
        ([1, 2, 3], False),
        (list("ABC"), True),
        (MultiIndex.from_product([range(3), range(3)]), False),
        (MultiIndex.from_product([range(3), ["foo", "bar"]]), True),
    ]
    for index, expect_plus in cases:
        buf = StringIO()
        df = DataFrame(1, columns=list("ab"), index=index)
        df.info(buf=buf)
        if expect_plus:
            assert "+" in buf.getvalue()
        else:
            assert "+" not in buf.getvalue()
def test_info_memory_usage_bug_on_multiindex():
    """Deep memory usage of a MultiIndexed frame stays close to its unstacked form."""
    # GH 14308
    # memory usage introspection should not materialize .values
    N = 100
    M = len(uppercase)
    index = MultiIndex.from_product(
        [list(uppercase), date_range("20160101", periods=N)],
        names=["id", "date"],
    )
    values = np.random.default_rng(2).standard_normal(N * M)
    df = DataFrame({"value": values}, index=index)
    unstacked = df.unstack("id")

    stacked_mem = df.memory_usage(deep=True).sum()
    unstacked_mem = unstacked.memory_usage(deep=True).sum()

    assert df.values.nbytes == unstacked.values.nbytes
    assert stacked_mem > unstacked_mem

    # high upper bound
    assert unstacked_mem - stacked_mem < 2000
def test_info_categorical():
    """``info`` runs on a frame whose index and columns are ``CategoricalIndex``."""
    # GH14298
    labels = CategoricalIndex(["a", "b"])
    frame = DataFrame(np.zeros((2, 2)), index=labels, columns=labels)
    frame.info(buf=StringIO())
@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
    """Integer column labels are rendered in the ``Column`` field of ``info``."""
    # GH#37245
    df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
    buf = StringIO()
    df.info(show_counts=True, buf=buf)
    result = buf.getvalue()
    # Table rows keep their column padding; dedent strips only the common
    # 8-space source indentation.
    expected = textwrap.dedent(
        """\
        <class 'pandas.core.frame.DataFrame'>
        Index: 2 entries, A to B
        Data columns (total 2 columns):
         #   Column  Non-Null Count  Dtype
        ---  ------  --------------  -----
         0   1       2 non-null      int64
         1   2       2 non-null      int64
        dtypes: int64(2)
        memory usage: 48.0+ bytes
        """
    )
    assert result == expected
def test_memory_usage_empty_no_warning():
    """``memory_usage`` on a column-less frame emits no warning."""
    # GH#50066
    frame = DataFrame(index=["a", "b"])
    with tm.assert_produces_warning(None):
        result = frame.memory_usage()
    # Expected index size: 16 on 64-bit builds, 8 otherwise.
    index_bytes = 16 if IS64 else 8
    expected = Series(index_bytes, index=["Index"])
    tm.assert_series_equal(result, expected)
@pytest.mark.single_cpu
def test_info_compute_numba():
    """``info`` output is the same with ``compute.use_numba`` on and off."""
    # GH#51922
    pytest.importorskip("numba")
    df = DataFrame([[1, 2], [3, 4]])

    with option_context("compute.use_numba", True):
        buf = StringIO()
        # Pass the buffer explicitly: a bare info() writes to stdout and
        # would leave both captured strings empty.
        df.info(buf=buf)
        result = buf.getvalue()

    buf = StringIO()
    df.info(buf=buf)
    expected = buf.getvalue()
    assert expected  # guard against comparing two empty captures
    assert result == expected

View File

@@ -0,0 +1,248 @@
import string
import numpy as np
import pytest
import pandas._config.config as cf
import pandas as pd
from pandas.io.formats import printing
import pandas.io.formats.format as fmt
def test_adjoin():
    """``adjoin`` lays lists out as columns separated by the requested gap."""
    data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]]
    # Every column except the last is padded to its widest entry plus the
    # 2-space gap.
    expected = "a  dd  ggg\nb  ee  hhh\nc  ff  iii"

    adjoined = printing.adjoin(2, *data)

    assert adjoined == expected
def test_repr_binary_type():
    """``pprint_thing`` quotes a string only when ``quote_strings`` is true."""
    alphabet = string.ascii_letters
    try:
        encoded = bytes(alphabet, encoding=cf.get_option("display.encoding"))
    except TypeError:
        encoded = bytes(alphabet)
    text = str(encoded.decode("utf-8"))

    quoted = printing.pprint_thing(text, quote_strings=True)
    assert quoted == repr(text)

    unquoted = printing.pprint_thing(text, quote_strings=False)
    assert unquoted == text
class TestFormattBase:
    """Tests for text adjoining/justification, including East Asian wide characters."""

    def test_adjoin(self):
        """Plain ``adjoin`` pads every column but the last to its width plus the gap."""
        data = [["a", "b", "c"], ["dd", "ee", "ff"], ["ggg", "hhh", "iii"]]
        expected = "a  dd  ggg\nb  ee  hhh\nc  ff  iii"

        adjoined = printing.adjoin(2, *data)

        assert adjoined == expected

    def test_adjoin_unicode(self):
        """Width-aware adjoining counts wide characters as two columns."""
        # The first cell must be the wide character the expectations start with.
        data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "hhh", "いいい"]]

        # Default adjoin pads by character count, ignoring display width.
        expected = "あ  dd  ggg\nb  ええ  hhh\nc  ff  いいい"
        adjoined = printing.adjoin(2, *data)
        assert adjoined == expected

        adj = fmt.EastAsianTextAdjustment()

        # Width-aware: column 1 is 2 + 2 wide, column 2 is 4 + 2 wide.
        expected = (
            "あ  dd    ggg\n"
            "b   ええ  hhh\n"
            "c   ff    いいい"
        )
        adjoined = adj.adjoin(2, *data)
        assert adjoined == expected
        cols = adjoined.split("\n")
        assert adj.len(cols[0]) == 13
        assert adj.len(cols[1]) == 13
        assert adj.len(cols[2]) == 16

        # Gap of 7: column 1 is 2 + 7 wide, column 2 is 4 + 7 wide.
        expected = (
            "あ       dd         ggg\n"
            "b        ええ       hhh\n"
            "c        ff         いいい"
        )
        adjoined = adj.adjoin(7, *data)
        assert adjoined == expected
        cols = adjoined.split("\n")
        assert adj.len(cols[0]) == 23
        assert adj.len(cols[1]) == 23
        assert adj.len(cols[2]) == 26

    def test_justify(self):
        """``justify`` pads to a display width, not a character count."""
        adj = fmt.EastAsianTextAdjustment()

        def just(x, *args, **kwargs):
            # wrapper to test single str
            return adj.justify([x], *args, **kwargs)[0]

        assert just("abc", 5, mode="left") == "abc  "
        assert just("abc", 5, mode="center") == " abc "
        assert just("abc", 5, mode="right") == "  abc"

        # Display width 6 already exceeds the target 5: nothing is added.
        assert just("パンダ", 5, mode="left") == "パンダ"
        assert just("パンダ", 5, mode="center") == "パンダ"
        assert just("パンダ", 5, mode="right") == "パンダ"

        # Display width 6 padded to 10: four spaces in total.
        assert just("パンダ", 10, mode="left") == "パンダ    "
        assert just("パンダ", 10, mode="center") == "  パンダ  "
        assert just("パンダ", 10, mode="right") == "    パンダ"

    def test_east_asian_len(self):
        """Full-width characters count as two columns, half-width ones as one."""
        adj = fmt.EastAsianTextAdjustment()

        # Half-width katakana, written as escapes so the code points survive
        # Unicode normalization by editors and tooling.
        halfwidth_panda = "\uff8a\uff9f\uff9d\uff80\uff9e"

        assert adj.len("abc") == 3

        assert adj.len("パンダ") == 6
        assert adj.len(halfwidth_panda) == 5
        assert adj.len("パンダpanda") == 11
        assert adj.len(halfwidth_panda + "panda") == 10

    def test_ambiguous_width(self):
        """Ambiguous-width characters count double only when the option is on."""
        adj = fmt.EastAsianTextAdjustment()
        assert adj.len("¡¡ab") == 4

        with cf.option_context("display.unicode.ambiguous_as_wide", True):
            adj = fmt.EastAsianTextAdjustment()
            assert adj.len("¡¡ab") == 6

        # `adj` was created while the option was on.
        data = [["あ", "b", "c"], ["dd", "ええ", "ff"], ["ggg", "¡¡ab", "いいい"]]
        # The last column is justified to 4 (its longest entry in characters),
        # so "ggg" gets one trailing space.
        expected = "あ  dd    ggg \nb   ええ  ¡¡ab\nc   ff    いいい"
        adjoined = adj.adjoin(2, *data)
        assert adjoined == expected
class TestTableSchemaRepr:
def test_publishes(self, ip):
ipython = ip.instance(config=ip.config)
df = pd.DataFrame({"A": [1, 2]})
objects = [df["A"], df] # dataframe / series
expected_keys = [
{"text/plain", "application/vnd.dataresource+json"},
{"text/plain", "text/html", "application/vnd.dataresource+json"},
]
opt = pd.option_context("display.html.table_schema", True)
last_obj = None
for obj, expected in zip(objects, expected_keys):
last_obj = obj
with opt:
formatted = ipython.display_formatter.format(obj)
assert set(formatted[0].keys()) == expected
with_latex = pd.option_context("styler.render.repr", "latex")
with opt, with_latex:
formatted = ipython.display_formatter.format(last_obj)
expected = {
"text/plain",
"text/html",
"text/latex",
"application/vnd.dataresource+json",
}
assert set(formatted[0].keys()) == expected
def test_publishes_not_implemented(self, ip):
# column MultiIndex
# GH 15996
midx = pd.MultiIndex.from_product([["A", "B"], ["a", "b", "c"]])
df = pd.DataFrame(
np.random.default_rng(2).standard_normal((5, len(midx))), columns=midx
)
opt = pd.option_context("display.html.table_schema", True)
with opt:
formatted = ip.instance(config=ip.config).display_formatter.format(df)
expected = {"text/plain", "text/html"}
assert set(formatted[0].keys()) == expected
def test_config_on(self):
df = pd.DataFrame({"A": [1, 2]})
with pd.option_context("display.html.table_schema", True):
result = df._repr_data_resource_()
assert result is not None
def test_config_default_off(self):
df = pd.DataFrame({"A": [1, 2]})
with pd.option_context("display.html.table_schema", False):
result = df._repr_data_resource_()
assert result is None
def test_enable_data_resource_formatter(self, ip):
# GH 10491
formatters = ip.instance(config=ip.config).display_formatter.formatters
mimetype = "application/vnd.dataresource+json"
with pd.option_context("display.html.table_schema", True):
assert "application/vnd.dataresource+json" in formatters
assert formatters[mimetype].enabled
# still there, just disabled
assert "application/vnd.dataresource+json" in formatters
assert not formatters[mimetype].enabled
# able to re-set
with pd.option_context("display.html.table_schema", True):
assert "application/vnd.dataresource+json" in formatters
assert formatters[mimetype].enabled
# smoke test that it works
ip.instance(config=ip.config).display_formatter.format(cf)
def test_multiindex_long_element():
# Non-regression test towards GH #52960
data = pd.MultiIndex.from_tuples([("c" * 62,)])
expected = (
"MultiIndex([('cccccccccccccccccccccccccccccccccccccccc"
"cccccccccccccccccccccc',)],\n )"
)
assert str(data) == expected
@pytest.mark.parametrize(
"data,output",
[
([2, complex("nan"), 1], [" 2.0+0.0j", " NaN+0.0j", " 1.0+0.0j"]),
([2, complex("nan"), -1], [" 2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]),
([-2, complex("nan"), -1], ["-2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]),
([-1.23j, complex("nan"), -1], ["-0.00-1.23j", " NaN+0.00j", "-1.00+0.00j"]),
([1.23j, complex("nan"), 1.23], [" 0.00+1.23j", " NaN+0.00j", " 1.23+0.00j"]),
(
[-1.23j, complex(np.nan, np.nan), 1],
["-0.00-1.23j", " NaN+ NaNj", " 1.00+0.00j"],
),
(
[-1.23j, complex(1.2, np.nan), 1],
["-0.00-1.23j", " 1.20+ NaNj", " 1.00+0.00j"],
),
(
[-1.23j, complex(np.nan, -1.2), 1],
["-0.00-1.23j", " NaN-1.20j", " 1.00+0.00j"],
),
],
)
@pytest.mark.parametrize("as_frame", [True, False])
def test_ser_df_with_complex_nans(data, output, as_frame):
# GH#53762, GH#53841
obj = pd.Series(np.array(data))
if as_frame:
obj = obj.to_frame(name="val")
reprs = [f"{i} {val}" for i, val in enumerate(output)]
expected = f"{'val': >{len(reprs[0])}}\n" + "\n".join(reprs)
else:
reprs = [f"{i} {val}" for i, val in enumerate(output)]
expected = "\n".join(reprs) + "\ndtype: complex128"
assert str(obj) == expected, f"\n{str(obj)}\n\n{expected}"

View File

@@ -0,0 +1,181 @@
from io import StringIO
from string import ascii_uppercase as uppercase
import textwrap
import numpy as np
import pytest
from pandas.compat import PYPY
from pandas import (
CategoricalIndex,
MultiIndex,
Series,
date_range,
)
def test_info_categorical_column_just_works():
    """``Series.info`` runs on categorical data and on a filtered selection of it."""
    n = 2500
    letters = np.array(list("abcdefghij"))
    picks = np.random.default_rng(2).integers(0, 10, size=n, dtype=int)
    ser = Series(letters.take(picks)).astype("category")
    ser.isna()
    ser.info(buf=StringIO())

    # Same smoke check on a boolean-mask selection.
    only_d = ser[ser == "d"]
    only_d.info(buf=StringIO())
def test_info_categorical():
    """``Series.info`` runs when the index is a ``CategoricalIndex``."""
    # GH14298
    labels = CategoricalIndex(["a", "b"])
    ser = Series(np.zeros(2), index=labels)
    ser.info(buf=StringIO())
@pytest.mark.parametrize("verbose", [True, False])
def test_info_series(lexsorted_two_level_string_multiindex, verbose):
    """``Series.info`` prints the name/count table only when ``verbose`` is true."""
    index = lexsorted_two_level_string_multiindex
    ser = Series(range(len(index)), index=index, name="sth")
    buf = StringIO()
    ser.info(verbose=verbose, buf=buf)
    result = buf.getvalue()

    expected = textwrap.dedent(
        """\
        <class 'pandas.core.series.Series'>
        MultiIndex: 10 entries, ('foo', 'one') to ('qux', 'three')
        """
    )
    if verbose:
        # Cells are padded to the "Non-Null Count" header width and joined
        # with two spaces.
        expected += textwrap.dedent(
            """\
            Series name: sth
            Non-Null Count  Dtype
            --------------  -----
            10 non-null     int64
            """
        )
    expected += textwrap.dedent(
        f"""\
        dtypes: int64(1)
        memory usage: {ser.memory_usage()}.0+ bytes
        """
    )
    assert result == expected
def test_info_memory():
    """``Series.info`` reports the exact byte count from ``memory_usage``."""
    s = Series([1, 2], dtype="i8")
    buf = StringIO()
    s.info(buf=buf)
    result = buf.getvalue()
    memory_bytes = float(s.memory_usage())
    # "2 non-null" is padded to the "Non-Null Count" header width (14)
    # before the two-space column gap.
    expected = textwrap.dedent(
        f"""\
        <class 'pandas.core.series.Series'>
        RangeIndex: 2 entries, 0 to 1
        Series name: None
        Non-Null Count  Dtype
        --------------  -----
        2 non-null      int64
        dtypes: int64(1)
        memory usage: {memory_bytes} bytes
        """
    )
    assert result == expected
def test_info_wide():
    """Passing ``max_cols`` to ``Series.info`` raises ``ValueError``."""
    ser = Series(np.random.default_rng(2).standard_normal(101))

    expected_message = (
        "Argument `max_cols` can only be passed in DataFrame.info, not Series.info"
    )
    with pytest.raises(ValueError, match=expected_message):
        ser.info(max_cols=1)
def test_info_shows_dtypes():
    """``Series.info`` shows the non-null count and dtype for each tested dtype."""
    dtypes = [
        "int64",
        "float64",
        "datetime64[ns]",
        "timedelta64[ns]",
        "complex128",
        "object",
        "bool",
    ]
    n = 10
    for dtype in dtypes:
        s = Series(np.random.default_rng(2).integers(2, size=n).astype(dtype))
        buf = StringIO()
        s.info(buf=buf)
        res = buf.getvalue()
        # "10 non-null" is padded to the 14-character "Non-Null Count" column
        # and followed by the two-space gap: five spaces in total.
        name = f"{n:d} non-null     {dtype}"
        assert name in res
@pytest.mark.xfail(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
    """``deep=True`` reports more memory than the shallow estimate for object data."""
    s_with_object_index = Series({"a": [1]}, index=["foo"])
    deep_usage = s_with_object_index.memory_usage(index=True, deep=True)
    shallow_usage = s_with_object_index.memory_usage(index=True)
    assert deep_usage > shallow_usage

    s_object = Series({"a": ["a"]})
    deep_usage = s_object.memory_usage(deep=True)
    shallow_usage = s_object.memory_usage()
    assert deep_usage > shallow_usage
@pytest.mark.xfail(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
    """Asserts deep and shallow memory usage are equal; marked xfail when not on PyPy."""
    s_with_object_index = Series({"a": [1]}, index=["foo"])
    deep_usage = s_with_object_index.memory_usage(index=True, deep=True)
    shallow_usage = s_with_object_index.memory_usage(index=True)
    assert deep_usage == shallow_usage

    s_object = Series({"a": ["a"]})
    deep_usage = s_object.memory_usage(deep=True)
    shallow_usage = s_object.memory_usage()
    assert deep_usage == shallow_usage
@pytest.mark.parametrize(
    "series, plus",
    [
        (Series(1, index=[1, 2, 3]), False),
        (Series(1, index=list("ABC")), True),
        (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False),
        (
            Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])),
            True,
        ),
    ],
)
def test_info_memory_usage_qualified(series, plus):
    """The ``+`` lower-bound marker appears in ``info`` exactly when ``plus`` is true."""
    out = StringIO()
    series.info(buf=out)
    has_plus = "+" in out.getvalue()
    if plus:
        assert has_plus
    else:
        assert not has_plus
def test_info_memory_usage_bug_on_multiindex():
    """Deep memory usage of a MultiIndexed Series stays close to its unstacked form."""
    # GH 14308
    # memory usage introspection should not materialize .values
    N = 100
    M = len(uppercase)
    index = MultiIndex.from_product(
        [list(uppercase), date_range("20160101", periods=N)],
        names=["id", "date"],
    )
    values = np.random.default_rng(2).standard_normal(N * M)
    s = Series(values, index=index)

    unstacked = s.unstack("id")
    assert s.values.nbytes == unstacked.values.nbytes
    assert s.memory_usage(deep=True) > unstacked.memory_usage(deep=True).sum()

    # high upper bound
    gap = unstacked.memory_usage(deep=True).sum() - s.memory_usage(deep=True)
    assert gap < 2000

View File

@@ -0,0 +1,733 @@
import io
import os
import sys
from zipfile import ZipFile
from _csv import Error
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
compat,
)
import pandas._testing as tm
class TestToCSV:
def test_to_csv_with_single_column(self):
# see gh-18676, https://bugs.python.org/issue32255
#
# Python's CSV library adds an extraneous '""'
# before the newline when the NaN-value is in
# the first row. Otherwise, only the newline
# character is added. This behavior is inconsistent
# and was patched in https://bugs.python.org/pull_request4672.
df1 = DataFrame([None, 1])
expected1 = """\
""
1.0
"""
with tm.ensure_clean("test.csv") as path:
df1.to_csv(path, header=None, index=None)
with open(path, encoding="utf-8") as f:
assert f.read() == expected1
df2 = DataFrame([1, None])
expected2 = """\
1.0
""
"""
with tm.ensure_clean("test.csv") as path:
df2.to_csv(path, header=None, index=None)
with open(path, encoding="utf-8") as f:
assert f.read() == expected2
def test_to_csv_default_encoding(self):
# GH17097
df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]})
with tm.ensure_clean("test.csv") as path:
# the default to_csv encoding is uft-8.
df.to_csv(path)
tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
def test_to_csv_quotechar(self):
df = DataFrame({"col": [1, 2]})
expected = """\
"","col"
"0","1"
"1","2"
"""
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1) # 1=QUOTE_ALL
with open(path, encoding="utf-8") as f:
assert f.read() == expected
expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, quotechar="$")
with open(path, encoding="utf-8") as f:
assert f.read() == expected
with tm.ensure_clean("test.csv") as path:
with pytest.raises(TypeError, match="quotechar"):
df.to_csv(path, quoting=1, quotechar=None)
def test_to_csv_doublequote(self):
df = DataFrame({"col": ['a"a', '"bb"']})
expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
with open(path, encoding="utf-8") as f:
assert f.read() == expected
with tm.ensure_clean("test.csv") as path:
with pytest.raises(Error, match="escapechar"):
df.to_csv(path, doublequote=False) # no escapechar set
def test_to_csv_escapechar(self):
df = DataFrame({"col": ['a"a', '"bb"']})
expected = """\
"","col"
"0","a\\"a"
"1","\\"bb\\""
"""
with tm.ensure_clean("test.csv") as path: # QUOTE_ALL
df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
with open(path, encoding="utf-8") as f:
assert f.read() == expected
df = DataFrame({"col": ["a,a", ",bb,"]})
expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE
with open(path, encoding="utf-8") as f:
assert f.read() == expected
def test_csv_to_string(self):
df = DataFrame({"col": [1, 2]})
expected_rows = [",col", "0,1", "1,2"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv() == expected
def test_to_csv_decimal(self):
# see gh-781
df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]})
expected_rows = [",col1,col2,col3", "0,1,a,10.1"]
expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv() == expected_default
expected_rows = [";col1;col2;col3", "0;1;a;10,1"]
expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(decimal=",", sep=";") == expected_european_excel
expected_rows = [",col1,col2,col3", "0,1,a,10.10"]
expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(float_format="%.2f") == expected_float_format_default
expected_rows = [";col1;col2;col3", "0;1;a;10,10"]
expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
assert (
df.to_csv(decimal=",", sep=";", float_format="%.2f")
== expected_float_format
)
# see gh-11553: testing if decimal is taken into account for '0.0'
df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1})
expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(index=False, decimal="^") == expected
# same but for an index
assert df.set_index("a").to_csv(decimal="^") == expected
# same for a multi-index
assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected
def test_to_csv_float_format(self):
# testing if float_format is taken into account for the index
# GH 11553
df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1})
expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(float_format="%.2f") == expected
# same for a multi-index
assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected
def test_to_csv_na_rep(self):
# see gh-11553
#
# Testing if NaN values are correctly represented in the index.
df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]})
expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(na_rep="_") == expected
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
# now with an index containing only NaNs
df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]})
expected_rows = ["a,b,c", "_,0,2", "_,1,3"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(na_rep="_") == expected
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
# check if na_rep parameter does not break anything when no NaN
df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]})
expected_rows = ["a,b,c", "0,0,2", "0,1,3"]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.set_index("a").to_csv(na_rep="_") == expected
assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
assert expected == csv
def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype):
# GH 29975
# Make sure full na_rep shows up when a dtype is provided
expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv(
na_rep="ZZZZZ"
)
assert expected == csv
def test_to_csv_date_format(self):
# GH 10209
df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")})
df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")})
expected_rows = [
",A",
"0,2013-01-01 00:00:00",
"1,2013-01-01 00:00:01",
"2,2013-01-01 00:00:02",
"3,2013-01-01 00:00:03",
"4,2013-01-01 00:00:04",
]
expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_sec.to_csv() == expected_default_sec
expected_rows = [
",A",
"0,2013-01-01 00:00:00",
"1,2013-01-02 00:00:00",
"2,2013-01-03 00:00:00",
"3,2013-01-04 00:00:00",
"4,2013-01-05 00:00:00",
]
expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day
expected_rows = [
",A",
"0,2013-01-01",
"1,2013-01-01",
"2,2013-01-01",
"3,2013-01-01",
"4,2013-01-01",
]
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
expected_rows = [
",A",
"0,2013-01-01",
"1,2013-01-02",
"2,2013-01-03",
"3,2013-01-04",
"4,2013-01-05",
]
expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
assert df_day.to_csv() == expected_default_day
assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day
# see gh-7791
#
# Testing if date_format parameter is taken into account
# for multi-indexed DataFrames.
df_sec["B"] = 0
df_sec["C"] = 1
expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
def test_to_csv_different_datetime_formats(self):
# GH#21734
df = DataFrame(
{
"date": pd.to_datetime("1970-01-01"),
"datetime": pd.date_range("1970-01-01", periods=2, freq="H"),
}
)
expected_rows = [
"date,datetime",
"1970-01-01,1970-01-01 00:00:00",
"1970-01-01,1970-01-01 01:00:00",
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert df.to_csv(index=False) == expected
def test_to_csv_date_format_in_categorical(self):
# GH#40754
ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d"))
ser = ser.astype("category")
expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
assert ser.to_csv(index=False) == expected
ser = pd.Series(
pd.date_range(
start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin"
).append(pd.DatetimeIndex([pd.NaT]))
)
ser = ser.astype("category")
assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected
def test_to_csv_float_ea_float_format(self):
# GH#45991
df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
df["a"] = df["a"].astype("Float64")
result = df.to_csv(index=False, float_format="%.5f")
expected = tm.convert_rows_list_to_csv_str(
["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"]
)
assert result == expected
def test_to_csv_float_ea_no_float_format(self):
# GH#45991
df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
df["a"] = df["a"].astype("Float64")
result = df.to_csv(index=False)
expected = tm.convert_rows_list_to_csv_str(
["a,b", "1.1,c", "2.02,c", ",c", "6.000006,c"]
)
assert result == expected
def test_to_csv_multi_index(self):
# see gh-6618
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
exp_rows = [",1", ",2", "0,1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv() == exp
exp_rows = ["1", "2", "1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv(index=False) == exp
df = DataFrame(
[1],
columns=pd.MultiIndex.from_arrays([[1], [2]]),
index=pd.MultiIndex.from_arrays([[1], [2]]),
)
exp_rows = [",,1", ",,2", "1,2,1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv() == exp
exp_rows = ["1", "2", "1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv(index=False) == exp
df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]]))
exp_rows = [",foo", ",bar", "0,1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv() == exp
exp_rows = ["foo", "bar", "1"]
exp = tm.convert_rows_list_to_csv_str(exp_rows)
assert df.to_csv(index=False) == exp
@pytest.mark.parametrize(
    "ind,expected",
    [
        (
            pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
            "x,data\n1.0,1\n",
        ),
        (
            pd.MultiIndex(
                levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"]
            ),
            "x,y,data\n1.0,2.0,1\n",
        ),
    ],
)
def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series):
    """A MultiIndex with one or two levels is written with its level names."""
    # see gh-19589
    named_series = pd.Series([1], ind, name="data")
    obj = frame_or_series(named_series)
    rendered = obj.to_csv(lineterminator="\n", header=True)
    assert rendered == expected
def test_to_csv_string_array_ascii(self):
# GH 10813
str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
df = DataFrame(str_array)
expected_ascii = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
with tm.ensure_clean("str_test.csv") as path:
df.to_csv(path, encoding="ascii")
with open(path, encoding="utf-8") as f:
assert f.read() == expected_ascii
def test_to_csv_string_array_utf8(self):
# GH 10813
str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
df = DataFrame(str_array)
expected_utf8 = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
with tm.ensure_clean("unicode_test.csv") as path:
df.to_csv(path, encoding="utf-8")
with open(path, encoding="utf-8") as f:
assert f.read() == expected_utf8
def test_to_csv_string_with_lf(self):
# GH 20353
data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]}
df = DataFrame(data)
with tm.ensure_clean("lf_test.csv") as path:
# case 1: The default line terminator(=os.linesep)(PR 21406)
os_linesep = os.linesep.encode("utf-8")
expected_noarg = (
b"int,str_lf"
+ os_linesep
+ b"1,abc"
+ os_linesep
+ b'2,"d\nef"'
+ os_linesep
+ b'3,"g\nh\n\ni"'
+ os_linesep
)
df.to_csv(path, index=False)
with open(path, "rb") as f:
assert f.read() == expected_noarg
with tm.ensure_clean("lf_test.csv") as path:
# case 2: LF as line terminator
expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n'
df.to_csv(path, lineterminator="\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_lf
with tm.ensure_clean("lf_test.csv") as path:
# case 3: CRLF as line terminator
# 'lineterminator' should not change inner element
expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n'
df.to_csv(path, lineterminator="\r\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_crlf
def test_to_csv_string_with_crlf(self):
# GH 20353
data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]}
df = DataFrame(data)
with tm.ensure_clean("crlf_test.csv") as path:
# case 1: The default line terminator(=os.linesep)(PR 21406)
os_linesep = os.linesep.encode("utf-8")
expected_noarg = (
b"int,str_crlf"
+ os_linesep
+ b"1,abc"
+ os_linesep
+ b'2,"d\r\nef"'
+ os_linesep
+ b'3,"g\r\nh\r\n\r\ni"'
+ os_linesep
)
df.to_csv(path, index=False)
with open(path, "rb") as f:
assert f.read() == expected_noarg
with tm.ensure_clean("crlf_test.csv") as path:
# case 2: LF as line terminator
expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n'
df.to_csv(path, lineterminator="\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_lf
with tm.ensure_clean("crlf_test.csv") as path:
# case 3: CRLF as line terminator
# 'lineterminator' should not change inner element
expected_crlf = (
b"int,str_crlf\r\n"
b"1,abc\r\n"
b'2,"d\r\nef"\r\n'
b'3,"g\r\nh\r\n\r\ni"\r\n'
)
df.to_csv(path, lineterminator="\r\n", index=False)
with open(path, "rb") as f:
assert f.read() == expected_crlf
def test_to_csv_stdout_file(self, capsys):
    """Writing to sys.stdout emits the CSV text and leaves stdout open."""
    # GH 21561
    frame = DataFrame(
        [["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"]
    )
    rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"]
    wanted = tm.convert_rows_list_to_csv_str(rows)

    frame.to_csv(sys.stdout, encoding="ascii")
    out_text = capsys.readouterr().out

    assert out_text == wanted
    assert not sys.stdout.closed
@pytest.mark.xfail(
    compat.is_platform_windows(),
    # The adjacent literals are concatenated verbatim, so each piece carries
    # its own trailing space to keep the rendered reason readable.
    reason=(
        "Especially in Windows, file stream should not be passed "
        "to csv writer without newline='' option. "
        "(https://docs.python.org/3/library/csv.html#csv.writer)"
    ),
)
def test_to_csv_write_to_open_file(self):
    """to_csv appends to an already-open text handle after manual content.

    The handle is opened without ``newline=""``; the test is marked as an
    expected failure when ``compat.is_platform_windows()`` is true.
    """
    # GH 21696
    frame = DataFrame({"a": ["x", "y", "z"]})
    wanted = """\
manual header
x
y
z
"""
    with tm.ensure_clean("test.txt") as txt_path:
        with open(txt_path, "w", encoding="utf-8") as handle:
            handle.write("manual header\n")
            frame.to_csv(handle, header=None, index=None)
        # Re-open for reading only after the writer handle has been closed.
        with open(txt_path, encoding="utf-8") as handle:
            assert handle.read() == wanted
def test_to_csv_write_to_open_file_with_newline_py3(self):
# see gh-21696
# see gh-20353
df = DataFrame({"a": ["x", "y", "z"]})
expected_rows = ["x", "y", "z"]
expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows)
with tm.ensure_clean("test.txt") as path:
with open(path, "w", newline="", encoding="utf-8") as f:
f.write("manual header\n")
df.to_csv(f, header=None, index=None)
with open(path, "rb") as f:
assert f.read() == bytes(expected, "utf-8")
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_csv_compression(
    self, compression_only, read_infer, to_infer, compression_to_extension
):
    """Round trip with compression given explicitly or inferred from the name."""
    # see gh-15008
    compression = compression_only

    # The extension is what makes "infer" resolve to the right method.
    filename = "test." + compression_to_extension[compression]
    frame = DataFrame({"A": [1]})

    write_with = "infer" if to_infer else compression
    read_with = "infer" if read_infer else compression

    with tm.ensure_clean(filename) as path:
        frame.to_csv(path, compression=write_with)
        reloaded = pd.read_csv(path, index_col=0, compression=read_with)
        tm.assert_frame_equal(reloaded, frame)
def test_to_csv_compression_dict(self, compression_only):
# GH 26023
method = compression_only
df = DataFrame({"ABC": [1]})
filename = "to_csv_compress_as_dict."
extension = {
"gzip": "gz",
"zstd": "zst",
}.get(method, method)
filename += extension
with tm.ensure_clean(filename) as path:
df.to_csv(path, compression={"method": method})
read_df = pd.read_csv(path, index_col=0)
tm.assert_frame_equal(read_df, df)
def test_to_csv_compression_dict_no_method_raises(self):
    """A compression dict without a "method" key is rejected with ValueError."""
    # GH 26023
    frame = DataFrame({"ABC": [1]})
    options_without_method = {"some_option": True}

    with tm.ensure_clean("out.zip") as path:
        with pytest.raises(ValueError, match="must have key 'method'"):
            frame.to_csv(path, compression=options_without_method)
@pytest.mark.parametrize("compression", ["zip", "infer"])
@pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"])
def test_to_csv_zip_arguments(self, compression, archive_name):
    """``archive_name`` in the compression dict names the single zip member."""
    # GH 26023
    frame = DataFrame({"ABC": [1]})
    zip_options = {"method": compression, "archive_name": archive_name}

    with tm.ensure_clean("to_csv_archive_name.zip") as path:
        frame.to_csv(path, compression=zip_options)
        with ZipFile(path) as archive:
            members = archive.filelist
            assert len(members) == 1
            assert members[0].filename == archive_name
@pytest.mark.parametrize(
    "filename,expected_arcname",
    [
        ("archive.csv", "archive.csv"),
        ("archive.tsv", "archive.tsv"),
        ("archive.csv.zip", "archive.csv"),
        ("archive.tsv.zip", "archive.tsv"),
        ("archive.zip", "archive"),
    ],
)
def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname):
    """Without ``archive_name`` the member name is derived from the file name."""
    # GH 39465
    frame = DataFrame({"ABC": [1]})
    target = tmp_path / filename

    frame.to_csv(target, compression="zip")

    with ZipFile(target) as archive:
        members = archive.filelist
        assert len(members) == 1
        assert members[0].filename == expected_arcname
@pytest.mark.parametrize("df_new_type", ["Int64"])
def test_to_csv_na_rep_long_string(self, df_new_type):
    """A multi-character ``na_rep`` is written in full after an astype cast."""
    # see gh-25099
    all_missing = DataFrame({"c": [float("nan")] * 3}).astype(df_new_type)
    wanted = tm.convert_rows_list_to_csv_str(["c", "mynull", "mynull", "mynull"])

    rendered = all_missing.to_csv(index=False, na_rep="mynull", encoding="ascii")

    assert wanted == rendered
def test_to_csv_timedelta_precision(self):
# GH 6783
s = pd.Series([1, 1]).astype("timedelta64[ns]")
buf = io.StringIO()
s.to_csv(buf)
result = buf.getvalue()
expected_rows = [
",0",
"0,0 days 00:00:00.000000001",
"1,0 days 00:00:00.000000001",
]
expected = tm.convert_rows_list_to_csv_str(expected_rows)
assert result == expected
def test_na_rep_truncated(self):
# https://github.com/pandas-dev/pandas/issues/31447
result = pd.Series(range(8, 12)).to_csv(na_rep="-")
expected = tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"])
assert result == expected
result = pd.Series([True, False]).to_csv(na_rep="nan")
expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"])
assert result == expected
result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
assert result == expected
@pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"])
def test_to_csv_errors(self, errors):
    """Writing a lone surrogate succeeds under each ``errors`` handler."""
    # GH 22610
    labels = ["\ud800foo"]
    ser = pd.Series(labels, index=pd.Index(labels))
    with tm.ensure_clean("test.csv") as csv_path:
        ser.to_csv(csv_path, errors=errors)
    # Nothing is read back: the error handling changes the data, so the
    # file content would not equal the input anyway.
@pytest.mark.parametrize("mode", ["wb", "w"])
def test_to_csv_binary_handle(self, mode):
    """
    to_csv accepts a binary file object, whether or not the ``mode`` passed
    to it contains a "b".

    GH 35058 and GH 19827
    """
    frame = tm.makeDataFrame()
    with tm.ensure_clean() as path:
        # The handle itself is always binary; only to_csv's ``mode`` varies.
        with open(path, mode="w+b") as binary_handle:
            frame.to_csv(binary_handle, mode=mode)
        reloaded = pd.read_csv(path, index_col=0)
        tm.assert_frame_equal(frame, reloaded)
@pytest.mark.parametrize("mode", ["wb", "w"])
def test_to_csv_encoding_binary_handle(self, mode):
    """
    A requested encoding is applied when writing to a binary file object.

    GH 23854 and GH 13068 with binary handles
    """
    # example from GH 23854
    source_bytes = "a, b, 🐟".encode("utf-8-sig")
    parsed = pd.read_csv(io.BytesIO(source_bytes), encoding="utf-8-sig")

    sink = io.BytesIO()
    parsed.to_csv(sink, mode=mode, encoding="utf-8-sig", index=False)
    sink.seek(0)  # tests whether file handle wasn't closed
    assert sink.getvalue().startswith(source_bytes)

    # example from GH 13068
    with tm.ensure_clean() as path:
        with open(path, "w+b") as binary_handle:
            DataFrame().to_csv(binary_handle, mode=mode, encoding="utf-8-sig")
            binary_handle.seek(0)
            written = binary_handle.read()
            # UTF-8 BOM followed by the quoted empty header cell
            assert written.startswith(b'\xef\xbb\xbf""')
def test_to_csv_iterative_compression_name(compression):
    """Chunked writing (chunksize=1) to a compressed path round-trips."""
    # GH 38714
    frame = tm.makeDataFrame()
    with tm.ensure_clean() as path:
        frame.to_csv(path, compression=compression, chunksize=1)
        reloaded = pd.read_csv(path, compression=compression, index_col=0)
        tm.assert_frame_equal(reloaded, frame)
def test_to_csv_iterative_compression_buffer(compression):
    """Chunked writing to a compressed in-memory buffer round-trips."""
    # GH 38714
    frame = tm.makeDataFrame()
    with io.BytesIO() as sink:
        frame.to_csv(sink, compression=compression, chunksize=1)
        sink.seek(0)
        reloaded = pd.read_csv(sink, compression=compression, index_col=0)
        tm.assert_frame_equal(reloaded, frame)
        # The caller's buffer must be left open by both calls.
        assert not sink.closed

View File

@@ -0,0 +1,429 @@
"""Tests formatting as writer-agnostic ExcelCells
ExcelFormatter is tested implicitly in pandas/tests/io/excel
"""
import string
import pytest
from pandas.errors import CSSWarning
import pandas._testing as tm
from pandas.io.formats.excel import (
CssExcelCell,
CSSToExcelConverter,
)
@pytest.mark.parametrize(
    "css,expected",
    [
        # FONT
        # - name
        ("font-family: foo,bar", {"font": {"name": "foo"}}),
        ('font-family: "foo bar",baz', {"font": {"name": "foo bar"}}),
        ("font-family: foo,\nbar", {"font": {"name": "foo"}}),
        ("font-family: foo, bar, baz", {"font": {"name": "foo"}}),
        ("font-family: bar, foo", {"font": {"name": "bar"}}),
        ("font-family: 'foo bar', baz", {"font": {"name": "foo bar"}}),
        ("font-family: 'foo \\'bar', baz", {"font": {"name": "foo 'bar"}}),
        ('font-family: "foo \\"bar", baz', {"font": {"name": 'foo "bar'}}),
        ('font-family: "foo ,bar", baz', {"font": {"name": "foo ,bar"}}),
        # - family
        ("font-family: serif", {"font": {"name": "serif", "family": 1}}),
        ("font-family: Serif", {"font": {"name": "serif", "family": 1}}),
        ("font-family: roman, serif", {"font": {"name": "roman", "family": 1}}),
        ("font-family: roman, sans-serif", {"font": {"name": "roman", "family": 2}}),
        ("font-family: roman, sans serif", {"font": {"name": "roman"}}),
        ("font-family: roman, sansserif", {"font": {"name": "roman"}}),
        ("font-family: roman, cursive", {"font": {"name": "roman", "family": 4}}),
        ("font-family: roman, fantasy", {"font": {"name": "roman", "family": 5}}),
        # - size
        ("font-size: 1em", {"font": {"size": 12}}),
        ("font-size: xx-small", {"font": {"size": 6}}),
        ("font-size: x-small", {"font": {"size": 7.5}}),
        ("font-size: small", {"font": {"size": 9.6}}),
        ("font-size: medium", {"font": {"size": 12}}),
        ("font-size: large", {"font": {"size": 13.5}}),
        ("font-size: x-large", {"font": {"size": 18}}),
        ("font-size: xx-large", {"font": {"size": 24}}),
        ("font-size: 50%", {"font": {"size": 6}}),
        # - bold
        ("font-weight: 100", {"font": {"bold": False}}),
        ("font-weight: 200", {"font": {"bold": False}}),
        ("font-weight: 300", {"font": {"bold": False}}),
        ("font-weight: 400", {"font": {"bold": False}}),
        ("font-weight: normal", {"font": {"bold": False}}),
        ("font-weight: lighter", {"font": {"bold": False}}),
        ("font-weight: bold", {"font": {"bold": True}}),
        ("font-weight: bolder", {"font": {"bold": True}}),
        ("font-weight: 700", {"font": {"bold": True}}),
        ("font-weight: 800", {"font": {"bold": True}}),
        ("font-weight: 900", {"font": {"bold": True}}),
        # - italic
        ("font-style: italic", {"font": {"italic": True}}),
        ("font-style: oblique", {"font": {"italic": True}}),
        # - underline
        ("text-decoration: underline", {"font": {"underline": "single"}}),
        ("text-decoration: overline", {}),
        ("text-decoration: none", {}),
        # - strike
        ("text-decoration: line-through", {"font": {"strike": True}}),
        (
            "text-decoration: underline line-through",
            {"font": {"strike": True, "underline": "single"}},
        ),
        (
            "text-decoration: underline; text-decoration: line-through",
            {"font": {"strike": True}},
        ),
        # - color
        ("color: red", {"font": {"color": "FF0000"}}),
        ("color: #ff0000", {"font": {"color": "FF0000"}}),
        ("color: #f0a", {"font": {"color": "FF00AA"}}),
        # - shadow
        ("text-shadow: none", {"font": {"shadow": False}}),
        ("text-shadow: 0px -0em 0px #CCC", {"font": {"shadow": False}}),
        ("text-shadow: 0px -0em 0px #999", {"font": {"shadow": False}}),
        ("text-shadow: 0px -0em 0px", {"font": {"shadow": False}}),
        ("text-shadow: 2px -0em 0px #CCC", {"font": {"shadow": True}}),
        ("text-shadow: 0px -2em 0px #CCC", {"font": {"shadow": True}}),
        ("text-shadow: 0px -0em 2px #CCC", {"font": {"shadow": True}}),
        ("text-shadow: 0px -0em 2px", {"font": {"shadow": True}}),
        ("text-shadow: 0px -2em", {"font": {"shadow": True}}),
        # FILL
        # - color, fillType
        (
            "background-color: red",
            {"fill": {"fgColor": "FF0000", "patternType": "solid"}},
        ),
        (
            "background-color: #ff0000",
            {"fill": {"fgColor": "FF0000", "patternType": "solid"}},
        ),
        (
            "background-color: #f0a",
            {"fill": {"fgColor": "FF00AA", "patternType": "solid"}},
        ),
        # BORDER
        # - style
        (
            "border-style: solid",
            {
                "border": {
                    "top": {"style": "medium"},
                    "bottom": {"style": "medium"},
                    "left": {"style": "medium"},
                    "right": {"style": "medium"},
                }
            },
        ),
        (
            "border-style: solid; border-width: thin",
            {
                "border": {
                    "top": {"style": "thin"},
                    "bottom": {"style": "thin"},
                    "left": {"style": "thin"},
                    "right": {"style": "thin"},
                }
            },
        ),
        (
            "border-top-style: solid; border-top-width: thin",
            {"border": {"top": {"style": "thin"}}},
        ),
        (
            "border-top-style: solid; border-top-width: 1pt",
            {"border": {"top": {"style": "thin"}}},
        ),
        ("border-top-style: solid", {"border": {"top": {"style": "medium"}}}),
        (
            "border-top-style: solid; border-top-width: medium",
            {"border": {"top": {"style": "medium"}}},
        ),
        (
            "border-top-style: solid; border-top-width: 2pt",
            {"border": {"top": {"style": "medium"}}},
        ),
        (
            "border-top-style: solid; border-top-width: thick",
            {"border": {"top": {"style": "thick"}}},
        ),
        (
            "border-top-style: solid; border-top-width: 4pt",
            {"border": {"top": {"style": "thick"}}},
        ),
        (
            "border-top-style: dotted",
            {"border": {"top": {"style": "mediumDashDotDot"}}},
        ),
        (
            "border-top-style: dotted; border-top-width: thin",
            {"border": {"top": {"style": "dotted"}}},
        ),
        ("border-top-style: dashed", {"border": {"top": {"style": "mediumDashed"}}}),
        (
            "border-top-style: dashed; border-top-width: thin",
            {"border": {"top": {"style": "dashed"}}},
        ),
        ("border-top-style: double", {"border": {"top": {"style": "double"}}}),
        # - color
        (
            "border-style: solid; border-color: #0000ff",
            {
                "border": {
                    "top": {"style": "medium", "color": "0000FF"},
                    "right": {"style": "medium", "color": "0000FF"},
                    "bottom": {"style": "medium", "color": "0000FF"},
                    "left": {"style": "medium", "color": "0000FF"},
                }
            },
        ),
        (
            "border-top-style: double; border-top-color: blue",
            {"border": {"top": {"style": "double", "color": "0000FF"}}},
        ),
        (
            "border-top-style: solid; border-top-color: #06c",
            {"border": {"top": {"style": "medium", "color": "0066CC"}}},
        ),
        (
            "border-top-color: blue",
            {"border": {"top": {"color": "0000FF", "style": "none"}}},
        ),
        # ALIGNMENT
        # - horizontal
        ("text-align: center", {"alignment": {"horizontal": "center"}}),
        ("text-align: left", {"alignment": {"horizontal": "left"}}),
        ("text-align: right", {"alignment": {"horizontal": "right"}}),
        ("text-align: justify", {"alignment": {"horizontal": "justify"}}),
        # - vertical
        ("vertical-align: top", {"alignment": {"vertical": "top"}}),
        ("vertical-align: text-top", {"alignment": {"vertical": "top"}}),
        ("vertical-align: middle", {"alignment": {"vertical": "center"}}),
        ("vertical-align: bottom", {"alignment": {"vertical": "bottom"}}),
        ("vertical-align: text-bottom", {"alignment": {"vertical": "bottom"}}),
        # - wrap_text
        ("white-space: nowrap", {"alignment": {"wrap_text": False}}),
        ("white-space: pre", {"alignment": {"wrap_text": False}}),
        ("white-space: pre-line", {"alignment": {"wrap_text": False}}),
        ("white-space: normal", {"alignment": {"wrap_text": True}}),
        # NUMBER FORMAT
        ("number-format: 0%", {"number_format": {"format_code": "0%"}}),
        (
            "number-format: 0§[Red](0)§-§@;",
            {"number_format": {"format_code": "0;[red](0);-;@"}},  # GH 46152
        ),
    ],
)
def test_css_to_excel(css, expected):
    """Each CSS declaration string converts to the expected Excel style dict."""
    converter = CSSToExcelConverter()
    converted = converter(css)
    assert expected == converted
def test_css_to_excel_multiple():
    """Several declarations in one string are merged; unknown ones are dropped."""
    stylesheet = """
font-weight: bold;
text-decoration: underline;
color: red;
border-width: thin;
text-align: center;
vertical-align: top;
unused: something;
"""
    thin_side = {"style": "thin"}
    wanted = {
        "font": {"bold": True, "underline": "single", "color": "FF0000"},
        "border": {
            "top": dict(thin_side),
            "right": dict(thin_side),
            "bottom": dict(thin_side),
            "left": dict(thin_side),
        },
        "alignment": {"horizontal": "center", "vertical": "top"},
    }

    converter = CSSToExcelConverter()
    actual = converter(stylesheet)

    assert wanted == actual
@pytest.mark.parametrize(
    "css,inherited,expected",
    [
        ("font-weight: bold", "", {"font": {"bold": True}}),
        ("", "font-weight: bold", {"font": {"bold": True}}),
        (
            "font-weight: bold",
            "font-style: italic",
            {"font": {"bold": True, "italic": True}},
        ),
        ("font-style: normal", "font-style: italic", {"font": {"italic": False}}),
        ("font-style: inherit", "", {}),
        (
            "font-style: normal; font-style: inherit",
            "font-style: italic",
            {"font": {"italic": True}},
        ),
    ],
)
def test_css_to_excel_inherited(css, inherited, expected):
    """Styles given to the converter's constructor are combined with ``css``."""
    converter = CSSToExcelConverter(inherited)
    converted = converter(css)
    assert expected == converted
@pytest.mark.parametrize(
    "input_color,output_color",
    (
        list(CSSToExcelConverter.NAMED_COLORS.items())
        + [("#" + rgb, rgb) for rgb in CSSToExcelConverter.NAMED_COLORS.values()]
        + [("#F0F", "FF00FF"), ("#ABC", "AABBCC")]
    ),
)
def test_css_to_excel_good_colors(input_color, output_color):
    """Named, 6-digit and 3-digit hex colours convert without any warning."""
    # see gh-18392
    css = (
        f"border-top-color: {input_color}; "
        f"border-right-color: {input_color}; "
        f"border-bottom-color: {input_color}; "
        f"border-left-color: {input_color}; "
        f"background-color: {input_color}; "
        f"color: {input_color}"
    )

    sides = ("top", "right", "bottom", "left")
    wanted = {
        "fill": {"patternType": "solid", "fgColor": output_color},
        "font": {"color": output_color},
        "border": {side: {"color": output_color, "style": "none"} for side in sides},
    }

    with tm.assert_produces_warning(None):
        converter = CSSToExcelConverter()
        assert wanted == converter(css)
@pytest.mark.parametrize("input_color", [None, "not-a-color"])
def test_css_to_excel_bad_colors(input_color):
    """Unrecognised colours raise CSSWarning and contribute no colour value."""
    # see gh-18392
    css = (
        f"border-top-color: {input_color}; "
        f"border-right-color: {input_color}; "
        f"border-bottom-color: {input_color}; "
        f"border-left-color: {input_color}; "
        f"background-color: {input_color}; "
        f"color: {input_color}"
    )

    # Only the non-None input is expected to leave a (colourless) solid fill.
    wanted = {"fill": {"patternType": "solid"}} if input_color is not None else {}

    with tm.assert_produces_warning(CSSWarning):
        converter = CSSToExcelConverter()
        assert wanted == converter(css)
def tests_css_named_colors_valid():
    """Every NAMED_COLORS value is six upper-case hexadecimal characters."""
    allowed_chars = {char.upper() for char in string.hexdigits}
    for hex_code in CSSToExcelConverter.NAMED_COLORS.values():
        assert len(hex_code) == 6
        assert set(hex_code) <= allowed_chars
def test_css_named_colors_from_mpl_present():
    """Each matplotlib CSS4 colour is in NAMED_COLORS with the same hex code."""
    mpl_colors = pytest.importorskip("matplotlib.colors")
    known = CSSToExcelConverter.NAMED_COLORS
    for color_name, hashed_hex in mpl_colors.CSS4_COLORS.items():
        assert color_name in known
        # drop the first character of the matplotlib value before comparing
        assert known[color_name] == hashed_hex[1:]
@pytest.mark.parametrize(
    "styles,expected",
    [
        ([("color", "green"), ("color", "red")], "color: red;"),
        ([("font-weight", "bold"), ("font-weight", "normal")], "font-weight: normal;"),
        ([("text-align", "center"), ("TEXT-ALIGN", "right")], "text-align: right;"),
    ],
)
def test_css_excel_cell_precedence(styles, expected):
    """It favors later declarations over earlier ones for the same property"""
    # See GH 47371
    converter = CSSToExcelConverter()
    converter._call_cached.cache_clear()

    cell_kwargs = {
        "row": 0,
        "col": 0,
        "val": "",
        "style": None,
        "css_styles": {(0, 0): styles},
        "css_row": 0,
        "css_col": 0,
        "css_converter": converter,
    }
    cell = CssExcelCell(**cell_kwargs)
    converter._call_cached.cache_clear()

    assert cell.style == converter(expected)
@pytest.mark.parametrize(
    "styles,cache_hits,cache_misses",
    [
        ([[("color", "green"), ("color", "red"), ("color", "green")]], 0, 1),
        (
            [
                [("font-weight", "bold")],
                [("font-weight", "normal"), ("font-weight", "bold")],
            ],
            1,
            1,
        ),
        ([[("text-align", "center")], [("TEXT-ALIGN", "center")]], 1, 1),
        (
            [
                [("font-weight", "bold"), ("text-align", "center")],
                [("font-weight", "bold"), ("text-align", "left")],
            ],
            0,
            2,
        ),
        (
            [
                [("font-weight", "bold"), ("text-align", "center")],
                [("font-weight", "bold"), ("text-align", "left")],
                [("font-weight", "bold"), ("text-align", "center")],
            ],
            1,
            2,
        ),
    ],
)
def test_css_excel_cell_cache(styles, cache_hits, cache_misses):
    """It converts each distinct cell style once and reuses the cached result"""
    # See GH 47371
    converter = CSSToExcelConverter()
    converter._call_cached.cache_clear()

    # One style list per column of row 0.
    css_styles = {(0, column): declarations for column, declarations in enumerate(styles)}
    for position in css_styles:
        style_row, style_col = position
        CssExcelCell(
            row=0,
            col=0,
            val="",
            style=None,
            css_styles=css_styles,
            css_row=style_row,
            css_col=style_col,
            css_converter=converter,
        )

    # Snapshot the counters before clearing the cache again.
    stats = converter._call_cached.cache_info()
    converter._call_cached.cache_clear()

    assert stats.hits == cache_hits
    assert stats.misses == cache_misses

View File

@@ -0,0 +1,980 @@
from datetime import datetime
from io import StringIO
import re
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
option_context,
)
import pandas._testing as tm
import pandas.io.formats.format as fmt
# Long filler paragraph (four sentences separated by single spaces).
lorem_ipsum = " ".join(
    (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod"
        " tempor incididunt ut labore et dolore magna aliqua.",
        "Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi"
        " ut aliquip ex ea commodo consequat.",
        "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum"
        " dolore eu fugiat nulla pariatur.",
        "Excepteur sint occaecat cupidatat non proident, sunt in culpa qui"
        " officia deserunt mollit anim id est laborum.",
    )
)
def expected_html(datapath, name):
    """
    Load a reference HTML document from the formats data directory.

    Parameters
    ----------
    datapath : pytest fixture
        The datapath fixture injected into a test by pytest.
    name : str
        The name of the HTML file without the suffix.

    Returns
    -------
    str : contents of the HTML file with trailing whitespace removed.
    """
    html_path = datapath("io", "formats", "data", "html", name + ".html")
    with open(html_path, encoding="utf-8") as handle:
        contents = handle.read()
    return contents.rstrip()
@pytest.fixture(params=["mixed", "empty"])
def biggie_df_fixture(request):
    """Provide either a 200-row mixed-dtype frame or a 200-row column-less frame."""
    kind = request.param
    if kind == "empty":
        return DataFrame(index=np.arange(200))
    if kind == "mixed":
        frame = DataFrame(
            {
                "A": np.random.default_rng(2).standard_normal(200),
                "B": tm.makeStringIndex(200),
            },
            index=np.arange(200),
        )
        # Blank out the leading rows of both columns.
        for column in ("A", "B"):
            frame.loc[:20, column] = np.nan
        return frame
    return None
@pytest.fixture(params=fmt._VALID_JUSTIFY_PARAMETERS)
def justify(request):
    """Yield each entry of ``fmt._VALID_JUSTIFY_PARAMETERS`` in turn."""
    justify_value = request.param
    return justify_value
@pytest.mark.parametrize("col_space", [30, 50])
def test_to_html_with_col_space(col_space):
    """Every column-header cell must carry the requested ``min-width``."""
    df = DataFrame(np.random.default_rng(2).random(size=(1, 3)))
    # check that col_space affects HTML generation
    # and be very brittle about it.
    result = df.to_html(col_space=col_space)
    # Only the header section is styled; row-label <th> cells in the body are
    # not, so restrict the check to the text before "tbody".
    header = result.split("tbody")[0]
    # Split on real newlines.  The previous raw string r"\n" never matched, so
    # the whole document was treated as a single "header" line.
    hdrs = [x for x in header.split("\n") if re.search(r"<th[>\s]", x)]
    assert len(hdrs) > 0
    for h in hdrs:
        assert "min-width" in h
        assert str(col_space) in h
def test_to_html_with_column_specific_col_space_raises():
    """A ``col_space`` of the wrong length or with unknown keys is rejected."""
    frame = DataFrame(
        np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"]
    )

    length_msg = (
        "Col_space length\\(\\d+\\) should match "
        "DataFrame number of columns\\(\\d+\\)"
    )
    # One list that is too short, one that is too long.
    for bad_widths in ([30, 40], [30, 40, 50, 60]):
        with pytest.raises(ValueError, match=length_msg):
            frame.to_html(col_space=bad_widths)

    with pytest.raises(ValueError, match="unknown column"):
        frame.to_html(col_space={"a": "foo", "b": 23, "d": 34})
def test_to_html_with_column_specific_col_space():
    """Per-column ``col_space`` (dict or list) is applied to each header cell."""
    frame = DataFrame(
        np.random.default_rng(2).random(size=(3, 3)), columns=["a", "b", "c"]
    )

    def th_lines(html):
        # Lines that contain a <th> cell (with or without attributes).
        return [line for line in html.split("\n") if re.search(r"<th[>\s]", line)]

    hdrs = th_lines(frame.to_html(col_space={"a": "2em", "b": 23}))
    assert 'min-width: 2em;">a</th>' in hdrs[1]
    assert 'min-width: 23px;">b</th>' in hdrs[2]
    assert "<th>c</th>" in hdrs[3]

    hdrs = th_lines(frame.to_html(col_space=["1em", 2, 3]))
    assert 'min-width: 1em;">a</th>' in hdrs[1]
    assert 'min-width: 2px;">b</th>' in hdrs[2]
    assert 'min-width: 3px;">c</th>' in hdrs[3]
def test_to_html_with_empty_string_label():
    """An empty-string index label must not be merged with its neighbour."""
    # GH 3547, to_html regards empty string labels as repeated labels
    frame = DataFrame({"c1": ["a", "b"], "c2": ["a", ""], "data": [1, 2]})
    indexed = frame.set_index(["c1", "c2"])
    html = indexed.to_html()
    assert "rowspan" not in html
@pytest.mark.parametrize(
    "df,expected",
    [
        (DataFrame({"\u03c3": np.arange(10.0)}), "unicode_1"),
        (DataFrame({"A": ["\u03c3"]}), "unicode_2"),
    ],
)
def test_to_html_unicode(df, expected, datapath):
    """Non-ASCII column names and values render as in the reference files."""
    reference = expected_html(datapath, expected)
    rendered = df.to_html()
    assert rendered == reference
def test_to_html_encoding(float_frame, tmp_path):
    """A file written with ``encoding="gbk"`` reads back as the same HTML."""
    # GH 28663
    target = tmp_path / "test.html"
    float_frame.to_html(target, encoding="gbk")
    with open(str(target), encoding="gbk") as handle:
        in_memory = float_frame.to_html()
        assert in_memory == handle.read()
def test_to_html_decimal(datapath):
    """The ``decimal`` argument replaces the decimal separator in the output."""
    # GH 12031
    frame = DataFrame({"A": [6.0, 3.1, 2.2]})
    rendered = frame.to_html(decimal=",")
    reference = expected_html(datapath, "gh12031_expected_output")
    assert rendered == reference
@pytest.mark.parametrize(
    "kwargs,string,expected",
    [
        ({}, "<type 'str'>", "escaped"),
        ({"escape": False}, "<b>bold</b>", "escape_disabled"),
    ],
)
def test_to_html_escaped(kwargs, string, expected, datapath):
    """Markup characters are escaped by default and kept with ``escape=False``."""
    row_a = "str<ing1 &amp;"
    row_b = "stri>ng2 &amp;"
    cells = {
        "co<l1": {row_a: string, row_b: string},
        "co>l2": {row_a: string, row_b: string},
    }
    rendered = DataFrame(cells).to_html(**kwargs)
    reference = expected_html(datapath, expected)
    assert rendered == reference
@pytest.mark.parametrize("index_is_named", [True, False])
def test_to_html_multiindex_index_false(index_is_named, datapath):
    """``index=False`` output is the same whether or not the index has a name."""
    # GH 8452
    frame = DataFrame(
        {"a": range(2), "b": range(3, 5), "c": range(5, 7), "d": range(3, 5)}
    )
    frame.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]])
    if index_is_named:
        frame.index = Index(frame.index.values, name="idx")
    rendered = frame.to_html(index=False)
    reference = expected_html(datapath, "gh8452_expected_output")
    assert rendered == reference
@pytest.mark.parametrize(
    "multi_sparse,expected",
    [
        (False, "multiindex_sparsify_false_multi_sparse_1"),
        (False, "multiindex_sparsify_false_multi_sparse_2"),
        (True, "multiindex_sparsify_1"),
        (True, "multiindex_sparsify_2"),
    ],
)
def test_to_html_multiindex_sparsify(multi_sparse, expected, datapath):
    """MultiIndex rendering follows the ``display.multi_sparse`` option."""
    row_index = MultiIndex.from_arrays(
        [[0, 0, 1, 1], [0, 1, 0, 1]], names=["foo", None]
    )
    frame = DataFrame([[0, 1], [2, 3], [4, 5], [6, 7]], index=row_index)
    # The "..._2" reference files also use a MultiIndex on the columns.
    if expected.endswith("2"):
        frame.columns = row_index[::2]
    with option_context("display.multi_sparse", multi_sparse):
        rendered = frame.to_html()
    reference = expected_html(datapath, expected)
    assert rendered == reference
@pytest.mark.parametrize(
    "max_rows,expected",
    [
        (60, "gh14882_expected_output_1"),
        # Test that ... appears in a middle level
        (56, "gh14882_expected_output_2"),
    ],
)
def test_to_html_multiindex_odd_even_truncate(max_rows, expected, datapath):
    """Row truncation of a three-level MultiIndex frame matches the references."""
    # GH 14882 - Issue on truncation with odd length DataFrame
    levels = [[100, 200, 300], [10, 20, 30], [1, 2, 3, 4, 5, 6, 7]]
    row_index = MultiIndex.from_product(levels, names=["a", "b", "c"])
    frame = DataFrame({"n": range(len(row_index))}, index=row_index)
    rendered = frame.to_html(max_rows=max_rows)
    reference = expected_html(datapath, expected)
    assert rendered == reference
@pytest.mark.parametrize(
    "df,formatters,expected",
    [
        (
            DataFrame(
                [[0, 1], [2, 3], [4, 5], [6, 7]],
                columns=["foo", None],
                index=np.arange(4),
            ),
            {"__index__": lambda x: "abcd"[x]},
            "index_formatter",
        ),
        (
            DataFrame({"months": [datetime(2016, 1, 1), datetime(2016, 2, 2)]}),
            {"months": lambda x: x.strftime("%Y-%m")},
            "datetime64_monthformatter",
        ),
        (
            DataFrame(
                {
                    "hod": pd.to_datetime(
                        ["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f"
                    )
                }
            ),
            {"hod": lambda x: x.strftime("%H:%M")},
            "datetime64_hourformatter",
        ),
        (
            DataFrame(
                {
                    "i": pd.Series([1, 2], dtype="int64"),
                    "f": pd.Series([1, 2], dtype="float64"),
                    "I": pd.Series([1, 2], dtype="Int64"),
                    "s": pd.Series([1, 2], dtype="string"),
                    "b": pd.Series([True, False], dtype="boolean"),
                    "c": pd.Series(["a", "b"], dtype=pd.CategoricalDtype(["a", "b"])),
                    "o": pd.Series([1, "2"], dtype=object),
                }
            ),
            [lambda x: "formatted"] * 7,
            "various_dtypes_formatted",
        ),
    ],
)
def test_to_html_formatters(df, formatters, expected, datapath):
    """Formatters given as a dict or a list produce the reference output."""
    reference = expected_html(datapath, expected)
    rendered = df.to_html(formatters=formatters)
    assert rendered == reference
def test_to_html_regression_GH6098():
    """Smoke test: a pivot table with accented labels has an HTML repr."""
    columns = {
        "clé1": ["a", "a", "b", "b", "a"],
        "clé2": ["1er", "2ème", "1er", "2ème", "1er"],
        "données1": np.random.default_rng(2).standard_normal(5),
        "données2": np.random.default_rng(2).standard_normal(5),
    }
    frame = DataFrame(columns)

    # it works
    pivoted = frame.pivot_table(index=["clé1"], columns=["clé2"])
    pivoted._repr_html_()
def test_to_html_truncate(datapath):
    """``max_rows``/``max_cols`` truncate a 20x20 frame as in the reference."""
    dates = pd.date_range(start="20010101", freq="D", periods=20)
    frame = DataFrame(index=dates, columns=range(20))
    rendered = frame.to_html(max_rows=8, max_cols=4)
    reference = expected_html(datapath, "truncate")
    assert rendered == reference
@pytest.mark.parametrize("size", [1, 5])
def test_html_invalid_formatters_arg_raises(size):
    """A formatter list whose length differs from the column count is rejected."""
    # issue-28469
    frame = DataFrame(columns=["a", "b", "c"])
    template = "Formatters length({}) should match DataFrame number of columns(3)"
    # The message contains parentheses, so escape it before using it as a regex.
    pattern = re.escape(template.format(size))
    with pytest.raises(ValueError, match=pattern):
        frame.to_html(formatters=["{}".format] * size)
def test_to_html_truncate_formatter(datapath):
    """Per-column formatters are still applied when columns are truncated."""
    # issue-25955
    records = [
        {"A": 1, "B": 2, "C": 3, "D": 4},
        {"A": 5, "B": 6, "C": 7, "D": 8},
        {"A": 9, "B": 10, "C": 11, "D": 12},
        {"A": 13, "B": 14, "C": 15, "D": 16},
    ]
    frame = DataFrame(records)

    def add_suffix(x):
        return str(x) + "_mod"

    formatters = [add_suffix, add_suffix, None, None]
    rendered = frame.to_html(formatters=formatters, max_cols=3)
    reference = expected_html(datapath, "truncate_formatter")
    assert rendered == reference
@pytest.mark.parametrize(
    "sparsify,expected",
    [(True, "truncate_multi_index"), (False, "truncate_multi_index_sparse_off")],
)
def test_to_html_truncate_multi_index(sparsify, expected, datapath):
    """Truncation of a MultiIndex frame, with and without sparsified labels."""
    outer = ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]
    inner = ["one", "two", "one", "two", "one", "two", "one", "two"]
    arrays = [outer, inner]
    frame = DataFrame(index=arrays, columns=arrays)
    rendered = frame.to_html(max_rows=7, max_cols=7, sparsify=sparsify)
    reference = expected_html(datapath, expected)
    assert rendered == reference
@pytest.mark.parametrize(
    "option,result,expected",
    [
        (None, lambda df: df.to_html(), "1"),
        (None, lambda df: df.to_html(border=2), "2"),
        (2, lambda df: df.to_html(), "2"),
        (2, lambda df: df._repr_html_(), "2"),
    ],
)
def test_to_html_border(option, result, expected):
    """The table border comes from the argument or from ``display.html.border``."""
    frame = DataFrame({"A": [1, 2]})
    render = result  # the parametrized value is a callable producing the HTML
    if option is not None:
        with option_context("display.html.border", option):
            html = render(frame)
    else:
        html = render(frame)
    assert f'border="{expected}"' in html
@pytest.mark.parametrize("biggie_df_fixture", ["mixed"], indirect=True)
def test_to_html(biggie_df_fixture):
    """``to_html`` returns a str, or writes to ``buf`` and returns None."""
    # TODO: split this test
    frame = biggie_df_fixture
    html = frame.to_html()

    sink = StringIO()
    returned = frame.to_html(buf=sink)
    assert returned is None
    assert sink.getvalue() == html
    assert isinstance(html, str)

    # Smoke-test a few argument combinations.
    swapped = ["B", "A"]
    frame.to_html(columns=swapped, col_space=17)
    frame.to_html(columns=swapped, formatters={"A": lambda x: f"{x:.1f}"})
    frame.to_html(columns=swapped, float_format=str)
    frame.to_html(columns=swapped, col_space=12, float_format=str)
@pytest.mark.parametrize("biggie_df_fixture", ["empty"], indirect=True)
def test_to_html_empty_dataframe(biggie_df_fixture):
    """Smoke test: a frame without columns can be rendered."""
    biggie_df_fixture.to_html()
def test_to_html_filename(biggie_df_fixture, tmpdir):
    """Writing to a path stores the same text that ``to_html()`` returns."""
    frame = biggie_df_fixture
    in_memory = frame.to_html()
    target = tmpdir.join("test.html")
    frame.to_html(target)
    on_disk = target.read()
    assert on_disk == in_memory
def test_to_html_with_no_bold():
    """With ``bold_rows=False`` the table body contains no emphasised cells."""
    df = DataFrame({"x": np.random.default_rng(2).standard_normal(5)})
    html = df.to_html(bold_rows=False)
    # Slice from the end of the header to the end of the document.  The
    # previous code indexed a single character, so the check could never fail.
    result = html[html.find("</thead>") :]
    assert "<strong" not in result
    # Row labels are expected to be plain <td> cells, not <th> cells.
    assert "<th>" not in result
def test_to_html_columns_arg(float_frame):
    """Columns not listed in ``columns`` are left out of the output."""
    html = float_frame.to_html(columns=["A"])
    assert "<th>B</th>" not in html
@pytest.mark.parametrize(
    "columns,justify,expected",
    [
        (
            MultiIndex.from_tuples(
                list(zip(np.arange(2).repeat(2), np.mod(range(4), 2))),
                names=["CL0", "CL1"],
            ),
            "left",
            "multiindex_1",
        ),
        (
            MultiIndex.from_tuples(list(zip(range(4), np.mod(range(4), 2)))),
            "right",
            "multiindex_2",
        ),
    ],
)
def test_to_html_multiindex(columns, justify, expected, datapath):
    """MultiIndex column headers honour the ``justify`` argument."""
    rows = [list("abcd"), list("efgh")]
    frame = DataFrame(rows, columns=columns)
    rendered = frame.to_html(justify=justify)
    reference = expected_html(datapath, expected)
    assert rendered == reference
def test_to_html_justify(justify, datapath):
    """Each valid ``justify`` value is written into the header row."""
    values = {"A": [6, 30000, 2], "B": [1, 2, 70000], "C": [223442, 0, 1]}
    frame = DataFrame(values, columns=["A", "B", "C"])
    rendered = frame.to_html(justify=justify)
    # The reference file is a template with a ``{justify}`` placeholder.
    template = expected_html(datapath, "justify")
    reference = template.format(justify=justify)
    assert rendered == reference
@pytest.mark.parametrize(
    "justify", ["super-right", "small-left", "noinherit", "tiny", "pandas"]
)
def test_to_html_invalid_justify(justify):
    """Unknown ``justify`` values raise ``ValueError``."""
    # GH 17527
    frame = DataFrame()
    with pytest.raises(ValueError, match="Invalid value for justify parameter"):
        frame.to_html(justify=justify)
class TestHTMLIndex:
    """``to_html`` output for flat and multi-level row indexes."""

    @pytest.fixture
    def df(self):
        """Three-row frame with an integer, a float and an object column."""
        return DataFrame(
            {"A": [1, 2, 3], "B": [1.2, 3.4, 5.6], "C": ["one", "two", np.nan]},
            columns=["A", "B", "C"],
            index=["foo", "bar", "baz"],
        )

    @pytest.fixture
    def expected_without_index(self, datapath):
        """Reference HTML shared by every ``index=False`` check."""
        return expected_html(datapath, "index_2")

    def test_to_html_flat_index_without_name(
        self, datapath, df, expected_without_index
    ):
        with_index = expected_html(datapath, "index_1")
        assert df.to_html() == with_index

        no_index_html = df.to_html(index=False)
        for label in df.index:
            assert label not in no_index_html
        assert no_index_html == expected_without_index

    def test_to_html_flat_index_with_name(self, datapath, df, expected_without_index):
        df.index = Index(["foo", "bar", "baz"], name="idx")
        with_index = expected_html(datapath, "index_3")
        assert df.to_html() == with_index
        assert df.to_html(index=False) == expected_without_index

    def test_to_html_multiindex_without_names(
        self, datapath, df, expected_without_index
    ):
        df.index = MultiIndex.from_tuples(
            [("foo", "car"), ("foo", "bike"), ("bar", "car")]
        )

        with_index = expected_html(datapath, "index_4")
        assert df.to_html() == with_index

        no_index_html = df.to_html(index=False)
        for label in ["foo", "bar", "car", "bike"]:
            assert label not in no_index_html
        # must be the same result as normal index
        assert no_index_html == expected_without_index

    def test_to_html_multiindex_with_names(self, datapath, df, expected_without_index):
        df.index = MultiIndex.from_tuples(
            [("foo", "car"), ("foo", "bike"), ("bar", "car")],
            names=["idx1", "idx2"],
        )
        with_index = expected_html(datapath, "index_5")
        assert df.to_html() == with_index
        assert df.to_html(index=False) == expected_without_index
@pytest.mark.parametrize("classes", ["sortable draggable", ["sortable", "draggable"]])
def test_to_html_with_classes(classes, datapath):
    """CSS classes given as a string or as a list yield the same table."""
    frame = DataFrame()
    reference = expected_html(datapath, "with_classes")
    rendered = frame.to_html(classes=classes)
    assert rendered == reference
def test_to_html_no_index_max_rows(datapath):
    """``index=False`` combined with ``max_rows`` matches the reference."""
    # GH 14998
    frame = DataFrame({"A": [1, 2, 3, 4]})
    rendered = frame.to_html(index=False, max_rows=1)
    reference = expected_html(datapath, "gh14998_expected_output")
    assert rendered == reference
def test_to_html_multiindex_max_cols(datapath):
    """``max_cols`` truncation with MultiIndex rows and columns."""
    # GH 6131
    row_index = MultiIndex(
        levels=[["ba", "bb", "bc"], ["ca", "cb", "cc"]],
        codes=[[0, 1, 2], [0, 1, 2]],
        names=["b", "c"],
    )
    col_index = MultiIndex(
        levels=[["d"], ["aa", "ab", "ac"]],
        codes=[[0, 0, 0], [0, 1, 2]],
        names=[None, "a"],
    )
    nan = np.nan
    values = np.array([[1.0, nan, nan], [nan, 2.0, nan], [nan, nan, 3.0]])
    frame = DataFrame(values, row_index, col_index)
    rendered = frame.to_html(max_cols=2)
    reference = expected_html(datapath, "gh6131_expected_output")
    assert rendered == reference
def test_to_html_multi_indexes_index_false(datapath):
    """``index=False`` with MultiIndex rows and columns matches the reference."""
    # GH 22579
    frame = DataFrame(
        {"a": range(10), "b": range(10, 20), "c": range(10, 20), "d": range(10, 20)}
    )
    frame.columns = MultiIndex.from_product([["a", "b"], ["c", "d"]])
    frame.index = MultiIndex.from_product([["a", "b"], ["c", "d", "e", "f", "g"]])
    rendered = frame.to_html(index=False)
    reference = expected_html(datapath, "gh22579_expected_output")
    assert rendered == reference
@pytest.mark.parametrize("index_names", [True, False])
@pytest.mark.parametrize("header", [True, False])
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize(
    "column_index, column_type",
    [
        (Index([0, 1]), "unnamed_standard"),
        (Index([0, 1], name="columns.name"), "named_standard"),
        (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"),
        (
            MultiIndex.from_product(
                [["a"], ["b", "c"]], names=["columns.name.0", "columns.name.1"]
            ),
            "named_multi",
        ),
    ],
)
@pytest.mark.parametrize(
    "row_index, row_type",
    [
        (Index([0, 1]), "unnamed_standard"),
        (Index([0, 1], name="index.name"), "named_standard"),
        (MultiIndex.from_product([["a"], ["b", "c"]]), "unnamed_multi"),
        (
            MultiIndex.from_product(
                [["a"], ["b", "c"]], names=["index.name.0", "index.name.1"]
            ),
            "named_multi",
        ),
    ],
)
def test_to_html_basic_alignment(
    datapath, row_index, row_type, column_index, column_type, index, header, index_names
):
    """Every index/header/index_names combination matches its reference file."""
    # GH 22747, GH 22579
    frame = DataFrame(
        np.zeros((2, 2), dtype=int), index=row_index, columns=column_index
    )
    rendered = frame.to_html(index=index, header=header, index_names=index_names)

    # Map the rendering options onto the reference file naming scheme.
    hide_names = not index_names
    if not index:
        row_type = "none"
    elif hide_names and row_type.startswith("named"):
        row_type = "un" + row_type

    if not header:
        column_type = "none"
    elif hide_names and column_type.startswith("named"):
        column_type = "un" + column_type

    filename = f"index_{row_type}_columns_{column_type}"
    reference = expected_html(datapath, filename)
    assert rendered == reference
@pytest.mark.parametrize("index_names", [True, False])
@pytest.mark.parametrize("header", [True, False])
@pytest.mark.parametrize("index", [True, False])
@pytest.mark.parametrize(
    "column_index, column_type",
    [
        (Index(np.arange(8)), "unnamed_standard"),
        (Index(np.arange(8), name="columns.name"), "named_standard"),
        (
            MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]),
            "unnamed_multi",
        ),
        (
            MultiIndex.from_product(
                [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"]
            ),
            "named_multi",
        ),
    ],
)
@pytest.mark.parametrize(
    "row_index, row_type",
    [
        (Index(np.arange(8)), "unnamed_standard"),
        (Index(np.arange(8), name="index.name"), "named_standard"),
        (
            MultiIndex.from_product([["a", "b"], ["c", "d"], ["e", "f"]]),
            "unnamed_multi",
        ),
        (
            MultiIndex.from_product(
                [["a", "b"], ["c", "d"], ["e", "f"]], names=["foo", None, "baz"]
            ),
            "named_multi",
        ),
    ],
)
def test_to_html_alignment_with_truncation(
    datapath, row_index, row_type, column_index, column_type, index, header, index_names
):
    """Truncated 8x8 output matches the reference for every option combination."""
    # GH 22747, GH 22579
    frame = DataFrame(
        np.arange(64).reshape(8, 8), index=row_index, columns=column_index
    )
    rendered = frame.to_html(
        max_rows=4, max_cols=4, index=index, header=header, index_names=index_names
    )

    # Map the rendering options onto the reference file naming scheme.
    hide_names = not index_names
    if not index:
        row_type = "none"
    elif hide_names and row_type.startswith("named"):
        row_type = "un" + row_type

    if not header:
        column_type = "none"
    elif hide_names and column_type.startswith("named"):
        column_type = "un" + column_type

    filename = f"trunc_df_index_{row_type}_columns_{column_type}"
    reference = expected_html(datapath, filename)
    assert rendered == reference
@pytest.mark.parametrize("index", [False, 0])
def test_to_html_truncation_index_false_max_rows(datapath, index):
    """Row truncation with a falsy ``index`` argument matches the reference."""
    # GH 15019
    values = [
        [1.764052, 0.400157],
        [0.978738, 2.240893],
        [1.867558, -0.977278],
        [0.950088, -0.151357],
        [-0.103219, 0.410599],
    ]
    frame = DataFrame(values)
    rendered = frame.to_html(max_rows=4, index=index)
    reference = expected_html(datapath, "gh15019_expected_output")
    assert rendered == reference
@pytest.mark.parametrize("index", [False, 0])
@pytest.mark.parametrize(
    "col_index_named, expected_output",
    [(False, "gh22783_expected_output"), (True, "gh22783_named_columns_index")],
)
def test_to_html_truncation_index_false_max_cols(
    datapath, index, col_index_named, expected_output
):
    """Column truncation with a falsy ``index`` argument matches the reference."""
    # GH 22783
    values = [
        [1.764052, 0.400157, 0.978738, 2.240893, 1.867558],
        [-0.977278, 0.950088, -0.151357, -0.103219, 0.410599],
    ]
    frame = DataFrame(values)
    if col_index_named:
        frame.columns.rename("columns.name", inplace=True)
    rendered = frame.to_html(max_cols=4, index=index)
    reference = expected_html(datapath, expected_output)
    assert rendered == reference
@pytest.mark.parametrize("notebook", [True, False])
def test_to_html_notebook_has_style(notebook):
    """The inline style rules are present only when ``notebook=True``."""
    frame = DataFrame({"A": [1, 2, 3]})
    html = frame.to_html(notebook=notebook)
    style_snippets = (
        "tbody tr th:only-of-type",
        "vertical-align: middle;",
        "thead th",
    )
    for snippet in style_snippets:
        if notebook:
            assert snippet in html
        else:
            assert snippet not in html
def test_to_html_with_index_names_false():
    """``index_names=False`` keeps the index name out of the output."""
    # GH 16493
    named_index = Index(["a", "b"], name="myindexname")
    frame = DataFrame({"A": [1, 2]}, index=named_index)
    html = frame.to_html(index_names=False)
    assert "myindexname" not in html
def test_to_html_with_id():
    """``table_id`` is written as the ``id`` attribute of the table."""
    # GH 8496
    named_index = Index(["a", "b"], name="myindexname")
    frame = DataFrame({"A": [1, 2]}, index=named_index)
    html = frame.to_html(index_names=False, table_id="TEST_ID")
    assert ' id="TEST_ID"' in html
@pytest.mark.parametrize(
    "value,float_format,expected",
    [
        (0.19999, "%.3f", "gh21625_expected_output"),
        (100.0, "%.0f", "gh22270_expected_output"),
    ],
)
def test_to_html_float_format_no_fixed_width(value, float_format, expected, datapath):
    """A %-style ``float_format`` string gives the reference output."""
    # GH 21625, GH 22270
    frame = DataFrame({"x": [value]})
    reference = expected_html(datapath, expected)
    rendered = frame.to_html(float_format=float_format)
    assert rendered == reference
@pytest.mark.parametrize(
    "render_links,expected",
    [(True, "render_links_true"), (False, "render_links_false")],
)
def test_to_html_render_links(render_links, expected, datapath):
    """Output with and without ``render_links`` matches the reference files."""
    # GH 2679
    rows = [
        [0, "https://pandas.pydata.org/?q1=a&q2=b", "pydata.org"],
        [0, "www.pydata.org", "pydata.org"],
    ]
    frame = DataFrame(rows, columns=["foo", "bar", None])
    rendered = frame.to_html(render_links=render_links)
    reference = expected_html(datapath, expected)
    assert rendered == reference
@pytest.mark.parametrize(
    "method,expected",
    [
        ("to_html", lambda x: lorem_ipsum),
        ("_repr_html_", lambda x: lorem_ipsum[: x - 4] + "..."),  # regression case
    ],
)
@pytest.mark.parametrize("max_colwidth", [10, 20, 50, 100])
def test_ignore_display_max_colwidth(method, expected, max_colwidth):
    """``to_html`` keeps the full text; ``_repr_html_`` shortens it."""
    # see gh-17004
    frame = DataFrame([lorem_ipsum])
    render = getattr(frame, method)
    with option_context("display.max_colwidth", max_colwidth):
        html = render()
    # ``expected`` is a callable mapping the width to the text to look for.
    wanted_text = expected(max_colwidth)
    assert wanted_text in html
@pytest.mark.parametrize("classes", [True, 0])
def test_to_html_invalid_classes_type(classes):
    """``classes`` values that are not str/list/tuple raise ``TypeError``."""
    # GH 25608
    frame = DataFrame()
    with pytest.raises(TypeError, match="classes must be a string, list, or tuple"):
        frame.to_html(classes=classes)
def test_to_html_round_column_headers():
    """Float column labels are rounded only in notebook mode."""
    # GH 17280
    frame = DataFrame([1], columns=[0.55555])
    with option_context("display.precision", 3):
        plain_html = frame.to_html(notebook=False)
        notebook_html = frame.to_html(notebook=True)
    assert "0.55555" in plain_html
    assert "0.556" in notebook_html
@pytest.mark.parametrize("unit", ["100px", "10%", "5em", 150])
def test_to_html_with_col_space_units(unit):
    """``col_space`` accepts CSS lengths; bare integers are taken as pixels."""
    # GH 25941
    frame = DataFrame(np.random.default_rng(2).random(size=(1, 3)))
    html = frame.to_html(col_space=unit)
    # Keep only the text before the body.
    header_part = html.split("tbody")[0]
    hdrs = [
        line for line in header_part.split("\n") if re.search(r"<th[>\s]", line)
    ]
    if isinstance(unit, int):
        unit = str(unit) + "px"
    opening_tag = f'<th style="min-width: {unit};">'
    for line in hdrs:
        assert opening_tag in line
def test_html_repr_min_rows_default(datapath):
    """Default display options: 20 rows shown in full, 61 rows truncated."""
    # gh-27991

    # default setting no truncation even if above min_rows
    short_frame = DataFrame({"a": range(20)})
    reference = expected_html(datapath, "html_repr_min_rows_default_no_truncation")
    assert short_frame._repr_html_() == reference

    # default of max_rows 60 triggers truncation if above
    long_frame = DataFrame({"a": range(61)})
    reference = expected_html(datapath, "html_repr_min_rows_default_truncated")
    assert long_frame._repr_html_() == reference
@pytest.mark.parametrize(
    "max_rows,min_rows,expected",
    [
        # truncated after first two rows
        (10, 4, "html_repr_max_rows_10_min_rows_4"),
        # when set to None, follow value of max_rows
        (12, None, "html_repr_max_rows_12_min_rows_None"),
        # when set value higher as max_rows, use the minimum
        (10, 12, "html_repr_max_rows_10_min_rows_12"),
        # max_rows of None -> never truncate
        (None, 12, "html_repr_max_rows_None_min_rows_12"),
    ],
)
def test_html_repr_min_rows(datapath, max_rows, min_rows, expected):
    """``display.max_rows`` / ``display.min_rows`` combinations on a 61-row frame."""
    # gh-27991
    frame = DataFrame({"a": range(61)})
    reference = expected_html(datapath, expected)
    with option_context("display.max_rows", max_rows, "display.min_rows", min_rows):
        rendered = frame._repr_html_()
    assert rendered == reference
def test_to_html_multilevel(multiindex_year_month_day_dataframe_random_data):
    """Smoke test: a multi-level frame and its transpose can be rendered."""
    frame = multiindex_year_month_day_dataframe_random_data
    frame.columns.name = "foo"
    frame.to_html()
    transposed = frame.T
    transposed.to_html()
@pytest.mark.parametrize("na_rep", ["NaN", "Ted"])
def test_to_html_na_rep_and_float_format(na_rep, datapath):
    """``na_rep`` is used for missing values when ``float_format`` is also set."""
    # https://github.com/pandas-dev/pandas/issues/13828
    rows = [
        ["A", 1.2225],
        ["A", None],
    ]
    frame = DataFrame(rows, columns=["Group", "Data"])
    rendered = frame.to_html(na_rep=na_rep, float_format="{:.2f}".format)
    # The reference file is a template with a ``{na_rep}`` placeholder.
    template = expected_html(datapath, "gh13828_expected_output")
    reference = template.format(na_rep=na_rep)
    assert rendered == reference
def test_to_html_na_rep_non_scalar_data(datapath):
    """``na_rep`` can be combined with a cell that holds a list."""
    # GH47103
    frame = DataFrame([{"a": 1, "b": [1, 2, 3]}])
    rendered = frame.to_html(na_rep="-")
    reference = expected_html(datapath, "gh47103_expected_output")
    assert rendered == reference
def test_to_html_float_format_object_col(datapath):
    """``float_format`` on an object column holding a float and a string."""
    # GH#40024
    frame = DataFrame(data={"x": [1000.0, "test"]})
    rendered = frame.to_html(float_format=lambda x: f"{x:,.0f}")
    reference = expected_html(datapath, "gh40024_expected_output")
    assert rendered == reference
def test_to_html_multiindex_col_with_colspace():
# GH#53885
df = DataFrame([[1, 2]])
df.columns = MultiIndex.from_tuples([(1, 1), (2, 1)])
result = df.to_html(col_space=100)
expected = (
'<table border="1" class="dataframe">\n'
" <thead>\n"
" <tr>\n"
' <th style="min-width: 100px;"></th>\n'
' <th style="min-width: 100px;">1</th>\n'
' <th style="min-width: 100px;">2</th>\n'
" </tr>\n"
" <tr>\n"
' <th style="min-width: 100px;"></th>\n'
' <th style="min-width: 100px;">1</th>\n'
' <th style="min-width: 100px;">1</th>\n'
" </tr>\n"
" </thead>\n"
" <tbody>\n"
" <tr>\n"
" <th>0</th>\n"
" <td>1</td>\n"
" <td>2</td>\n"
" </tr>\n"
" </tbody>\n"
"</table>"
)
assert result == expected
def test_to_html_tuple_col_with_colspace():
    """``col_space`` works when a column label is a tuple."""
    # GH#53885
    df = DataFrame({("a", "b"): [1], "b": [2]})
    result = df.to_html(col_space=100)
    # Nesting is indented two spaces per level: sections at 2, rows at 4,
    # cells at 6.  The literal previously had this indentation collapsed.
    expected = (
        '<table border="1" class="dataframe">\n'
        "  <thead>\n"
        '    <tr style="text-align: right;">\n'
        '      <th style="min-width: 100px;"></th>\n'
        '      <th style="min-width: 100px;">(a, b)</th>\n'
        '      <th style="min-width: 100px;">b</th>\n'
        "    </tr>\n"
        "  </thead>\n"
        "  <tbody>\n"
        "    <tr>\n"
        "      <th>0</th>\n"
        "      <td>1</td>\n"
        "      <td>2</td>\n"
        "    </tr>\n"
        "  </tbody>\n"
        "</table>"
    )
    assert result == expected
def test_to_html_empty_complex_array():
# GH#54167
df = DataFrame({"x": np.array([], dtype="complex")})
result = df.to_html(col_space=100)
expected = (
'<table border="1" class="dataframe">\n'
" <thead>\n"
' <tr style="text-align: right;">\n'
' <th style="min-width: 100px;"></th>\n'
' <th style="min-width: 100px;">x</th>\n'
" </tr>\n"
" </thead>\n"
" <tbody>\n"
" </tbody>\n"
"</table>"
)
assert result == expected

View File

@@ -0,0 +1,90 @@
from io import StringIO
import pytest
import pandas as pd
pytest.importorskip("tabulate")
def test_simple():
    """``to_markdown`` writes a pipe table for a one-column frame into ``buf``."""
    buf = StringIO()
    df = pd.DataFrame([1, 2, 3])
    df.to_markdown(buf=buf)
    result = buf.getvalue()
    # Every row is padded to the width of the separator row (4 and 5
    # characters); the literal previously had that padding collapsed.
    assert (
        result == "|    |   0 |\n|---:|----:|\n|  0 |   1 |\n|  1 |   2 |\n|  2 |   3 |"
    )
def test_empty_frame():
    """A frame with no rows renders as a header row plus separator row."""
    buf = StringIO()
    df = pd.DataFrame({"id": [], "first_name": [], "last_name": []}).set_index("id")
    df.to_markdown(buf=buf)
    result = buf.getvalue()
    # Header cells are padded to the separator widths (6, 14 and 13
    # characters); the literal previously had that padding collapsed.
    assert result == (
        "| id   | first_name   | last_name   |\n"
        "|------|--------------|-------------|"
    )
def test_other_tablefmt():
    """``tablefmt`` is forwarded, here selecting the "jira" layout."""
    buf = StringIO()
    df = pd.DataFrame([1, 2, 3])
    df.to_markdown(buf=buf, tablefmt="jira")
    result = buf.getvalue()
    # NOTE(review): padding restored on the assumption that the jira layout
    # uses the same cell widths as the pipe layout in the sibling tests
    # (4 and 5 characters) -- confirm against real output.
    assert result == "||    ||   0 ||\n|  0 |   1 |\n|  1 |   2 |\n|  2 |   3 |"
def test_other_headers():
    """Custom ``headers`` replace the default column titles."""
    buf = StringIO()
    df = pd.DataFrame([1, 2, 3])
    df.to_markdown(buf=buf, headers=["foo", "bar"])
    result = buf.getvalue()
    # Every cell is padded to the separator width (7 characters); the literal
    # previously had that padding collapsed.
    assert result == (
        "|   foo |   bar |\n|------:|------:|\n|     0 "
        "|     1 |\n|     1 |     2 |\n|     2 |     3 |"
    )
def test_series():
    """A named Series renders with its name as the column title."""
    buf = StringIO()
    s = pd.Series([1, 2, 3], name="foo")
    s.to_markdown(buf=buf)
    result = buf.getvalue()
    # Cells are padded to the separator widths (4 and 7 characters); the
    # literal previously had that padding collapsed.
    assert result == (
        "|    |   foo |\n|---:|------:|\n|  0 |     1 "
        "|\n|  1 |     2 |\n|  2 |     3 |"
    )
def test_no_buf():
    """Without ``buf`` the table is returned as a string."""
    df = pd.DataFrame([1, 2, 3])
    result = df.to_markdown()
    # Every row is padded to the width of the separator row (4 and 5
    # characters); the literal previously had that padding collapsed.
    assert (
        result == "|    |   0 |\n|---:|----:|\n|  0 |   1 |\n|  1 |   2 |\n|  2 |   3 |"
    )
@pytest.mark.parametrize("index", [True, False])
def test_index(index):
    """``index`` controls whether the row labels form the first column."""
    # GH 32667
    df = pd.DataFrame([1, 2, 3])

    result = df.to_markdown(index=index)

    # Cells are padded to the separator widths; the literals previously had
    # that padding collapsed.
    if index:
        expected = (
            "|    |   0 |\n|---:|----:|\n|  0 |   1 |\n|  1 |   2 |\n|  2 |   3 |"
        )
    else:
        expected = "|   0 |\n|----:|\n|   1 |\n|   2 |\n|   3 |"
    assert result == expected
def test_showindex_disallowed_in_kwargs():
    """Passing ``showindex`` to ``to_markdown`` raises ``ValueError``."""
    # GH 32667; disallowing showindex in kwargs enforced in 2.0
    frame = pd.DataFrame([1, 2, 3])
    error_pattern = "Pass 'index' instead of 'showindex"
    with pytest.raises(ValueError, match=error_pattern):
        frame.to_markdown(index=True, showindex=True)

View File

@@ -0,0 +1,357 @@
from datetime import datetime
from io import StringIO
from textwrap import dedent
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
option_context,
to_datetime,
)
def test_repr_embedded_ndarray():
    """Smoke test: cells that hold ndarrays can be repr'd and stringified."""
    records = np.empty(10, dtype=[("err", object)])
    for pos in range(records.shape[0]):
        # Entry ``pos`` holds an array of ``pos`` random values.
        records["err"][pos] = np.random.default_rng(2).standard_normal(pos)

    frame = DataFrame(records)
    repr(frame["err"])
    repr(frame)
    frame.to_string()
def test_repr_tuples():
    """Smoke test: a column of tuples can be repr'd and written to a buffer."""
    sink = StringIO()
    pairs = list(zip(range(10), range(10)))
    frame = DataFrame({"tups": pairs})
    repr(frame)
    frame.to_string(col_space=10, buf=sink)
def test_to_string_truncate():
    """``to_string`` ignores ``display.max_colwidth`` but honours its own arg."""
    # GH 9784 - dont truncate when calling DataFrame.to_string
    long_text = (
        "let's make this a very VERY long line that is longer "
        "than the default 50 character limit"
    )
    df = DataFrame(
        [
            {"a": "foo", "b": "bar", "c": long_text, "d": 1},
            {"a": "foo", "b": "bar", "c": "stuff", "d": 1},
        ]
    )
    # The former ``df.set_index(["a", "b", "c"])`` call was dropped: its result
    # was discarded, and the expected text below shows the default 0/1 index.

    # Columns are right-justified, so the padding is spelled out as counts.
    # The literals previously had these runs of spaces collapsed.
    untruncated = (
        "     a    b" + " " * 89 + "c  d\n"
        "0  foo  bar  " + long_text + "  1\n"
        "1  foo  bar" + " " * 85 + "stuff  1"
    )
    assert df.to_string() == untruncated
    with option_context("max_colwidth", 20):
        # the display option has no effect on the to_string method
        assert df.to_string() == untruncated
    assert df.to_string(max_colwidth=20) == (
        "     a    b" + " " * 20 + "c  d\n"
        "0  foo  bar  let's make this ...  1\n"
        "1  foo  bar" + " " * 16 + "stuff  1"
    )
@pytest.mark.parametrize(
    "input_array, expected",
    [
        ("a", "a"),
        (["a", "b"], "a\nb"),
        ([1, "a"], "1\na"),
        (1, "1"),
        ([0, -1], " 0\n-1"),
        (1.0, "1.0"),
        ([" a", " b"], " a\n b"),
        ([".1", "1"], ".1\n 1"),
        (["10", "-10"], " 10\n-10"),
    ],
)
def test_format_remove_leading_space_series(input_array, expected):
    """``Series.to_string(index=False)`` output for scalar and list inputs."""
    # GH: 24980
    series = Series(input_array)
    rendered = series.to_string(index=False)
    assert rendered == expected
@pytest.mark.parametrize(
    "input_array, expected",
    [
        ({"A": ["a"]}, "A\na"),
        # Column "B" is two characters wide and right-justified, so shorter
        # cells get a padding space.  The literals previously had it collapsed.
        ({"A": ["a", "b"], "B": ["c", "dd"]}, "A  B\na  c\nb dd"),
        ({"A": ["a", 1], "B": ["aa", 1]}, "A  B\na aa\n1  1"),
    ],
)
def test_format_remove_leading_space_dataframe(input_array, expected):
    """``DataFrame.to_string(index=False)`` has no leading blank column."""
    # GH: 24980
    df = DataFrame(input_array).to_string(index=False)
    assert df == expected
# NOTE(review): the expected strings below are whitespace-sensitive and their
# column padding looks collapsed to single spaces -- confirm each literal
# against real ``to_string`` output before relying on this test.
@pytest.mark.parametrize(
    "max_cols, max_rows, expected",
    [
        (
            10,
            None,
            " 0 1 2 3 4 ... 6 7 8 9 10\n"
            " 0 0 0 0 0 ... 0 0 0 0 0\n"
            " 0 0 0 0 0 ... 0 0 0 0 0\n"
            " 0 0 0 0 0 ... 0 0 0 0 0\n"
            " 0 0 0 0 0 ... 0 0 0 0 0",
        ),
        (
            None,
            2,
            " 0 1 2 3 4 5 6 7 8 9 10\n"
            " 0 0 0 0 0 0 0 0 0 0 0\n"
            " .. .. .. .. .. .. .. .. .. .. ..\n"
            " 0 0 0 0 0 0 0 0 0 0 0",
        ),
        (
            10,
            2,
            " 0 1 2 3 4 ... 6 7 8 9 10\n"
            " 0 0 0 0 0 ... 0 0 0 0 0\n"
            " .. .. .. .. .. ... .. .. .. .. ..\n"
            " 0 0 0 0 0 ... 0 0 0 0 0",
        ),
        (
            9,
            2,
            " 0 1 2 3 ... 7 8 9 10\n"
            " 0 0 0 0 ... 0 0 0 0\n"
            " .. .. .. .. ... .. .. .. ..\n"
            " 0 0 0 0 ... 0 0 0 0",
        ),
        (
            1,
            1,
            " 0 ...\n 0 ...\n.. ...",
        ),
    ],
)
def test_truncation_no_index(max_cols, max_rows, expected):
    """Row/column truncation of a 4x11 frame of zeros rendered without index."""
    df = DataFrame([[0] * 11] * 4)
    assert df.to_string(index=False, max_cols=max_cols, max_rows=max_rows) == expected
def test_to_string_unicode_columns(float_frame):
    """Smoke test: ``to_string`` and ``info`` work with a non-ASCII column name."""
    sigma_frame = DataFrame({"\u03c3": np.arange(10.0)})

    string_sink = StringIO()
    sigma_frame.to_string(buf=string_sink)
    string_sink.getvalue()

    info_sink = StringIO()
    sigma_frame.info(buf=info_sink)
    info_sink.getvalue()

    rendered = float_frame.to_string()
    assert isinstance(rendered, str)
def test_to_string_utf8_columns():
    """Smoke test: a frame whose column label is a bytes object can be repr'd."""
    encoded_label = "\u05d0".encode()
    with option_context("display.max_rows", 1):
        frame = DataFrame([1, 2], columns=[encoded_label])
        repr(frame)
def test_to_string_unicode_two():
    """Smoke test: an empty frame with a non-ASCII column name writes to a buffer."""
    frame = DataFrame({"c/\u03c3": []})
    sink = StringIO()
    frame.to_string(sink)
def test_to_string_unicode_three():
    """Smoke test: a frame holding the character ``"\\xc2"`` writes to a buffer."""
    frame = DataFrame(["\xc2"])
    sink = StringIO()
    frame.to_string(sink)
def test_to_string_with_formatters():
    """Formatters passed as a dict and as a positional sequence agree."""
    df = DataFrame(
        {
            "int": [1, 2, 3],
            "float": [1.0, 2.0, 3.0],
            "object": [(1, 2), True, False],
        },
        columns=["int", "float", "object"],
    )
    # (column name, formatter) pairs in column order.
    formatters = [
        ("int", lambda x: f"0x{x:x}"),
        ("float", lambda x: f"[{x: 4.1f}]"),
        ("object", lambda x: f"-{x!s}-"),
    ]
    # Once keyed by column name ...
    result = df.to_string(formatters=dict(formatters))
    # ... and once as just the callables, in column order.
    result2 = df.to_string(formatters=list(zip(*formatters))[1])
    # NOTE(review): this literal is whitespace-sensitive and its column padding
    # looks collapsed to single spaces -- confirm against real output.
    assert result == (
        " int float object\n"
        "0 0x1 [ 1.0] -(1, 2)-\n"
        "1 0x2 [ 2.0] -True-\n"
        "2 0x3 [ 3.0] -False-"
    )
    assert result == result2
def test_to_string_with_datetime64_monthformatter():
    """A custom formatter can render a datetime column as year-month."""
    frame = DataFrame({"months": [datetime(2016, 1, 1), datetime(2016, 2, 2)]})

    def year_month(x):
        return x.strftime("%Y-%m")

    rendered = frame.to_string(formatters={"months": year_month})
    wanted = dedent(
        """\
        months
        0 2016-01
        1 2016-02"""
    )
    # Outer whitespace is stripped before comparing.
    assert rendered.strip() == wanted
def test_to_string_with_datetime64_hourformatter():
    """A custom formatter can render a datetime column as hour:minute."""
    times = to_datetime(["10:10:10.100", "12:12:12.120"], format="%H:%M:%S.%f")
    frame = DataFrame({"hod": times})

    def hour_minute(x):
        return x.strftime("%H:%M")

    rendered = frame.to_string(formatters={"hod": hour_minute})
    wanted = dedent(
        """\
        hod
        0 10:10
        1 12:12"""
    )
    # Outer whitespace is stripped before comparing.
    assert rendered.strip() == wanted
def test_to_string_with_formatters_unicode():
    """Formatters can be keyed by a column label containing non-ASCII text."""
    df = DataFrame({"c/\u03c3": [1, 2, 3]})
    result = df.to_string(formatters={"c/\u03c3": str})
    # Values are right-aligned under the three-character header, so the
    # padding inside the literal is significant.
    expected = dedent(
        """\
          c/\u03c3
        0   1
        1   2
        2   3"""
    )
    assert result == expected
def test_to_string_complex_number_trims_zeros():
s = Series([1.000000 + 1.000000j, 1.0 + 1.0j, 1.05 + 1.0j])
result = s.to_string()
expected = dedent(
"""\
0 1.00+1.00j
1 1.00+1.00j
2 1.05+1.00j"""
)
assert result == expected
def test_nullable_float_to_string(float_ea_dtype):
    """Nullable-float Series render missing values as ``<NA>``."""
    # https://github.com/pandas-dev/pandas/issues/36775
    dtype = float_ea_dtype
    s = Series([0.0, 1.0, None], dtype=dtype)
    result = s.to_string()
    # Values are right-aligned to the width of "<NA>"; padding is significant.
    expected = dedent(
        """\
        0     0.0
        1     1.0
        2    <NA>"""
    )
    assert result == expected
def test_nullable_int_to_string(any_int_ea_dtype):
    """Nullable-integer Series render missing values as ``<NA>``."""
    # https://github.com/pandas-dev/pandas/issues/36775
    dtype = any_int_ea_dtype
    s = Series([0, 1, None], dtype=dtype)
    result = s.to_string()
    # Values are right-aligned to the width of "<NA>"; padding is significant.
    expected = dedent(
        """\
        0       0
        1       1
        2    <NA>"""
    )
    assert result == expected
@pytest.mark.parametrize("na_rep", ["NaN", "Ted"])
def test_to_string_na_rep_and_float_format(na_rep):
    """``na_rep`` is honoured for missing values even when ``float_format`` is set."""
    # GH 13828
    df = DataFrame([["A", 1.2225], ["A", None]], columns=["Group", "Data"])
    result = df.to_string(na_rep=na_rep, float_format="{:.2f}".format)
    # Both parametrized na_rep values are three characters wide, so the
    # right-aligned layout below holds for each of them.
    expected = dedent(
        f"""\
          Group  Data
        0     A  1.22
        1     A   {na_rep}"""
    )
    assert result == expected
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            {"col1": [1, 2], "col2": [3, 4]},
            "   col1  col2\n0     1     3\n1     2     4",
        ),
        (
            {"col1": ["Abc", 0.756], "col2": [np.nan, 4.5435]},
            "    col1    col2\n0    Abc     NaN\n1  0.756  4.5435",
        ),
        (
            {"col1": [np.nan, "a"], "col2": [0.009, 3.543], "col3": ["Abc", 23]},
            "  col1   col2 col3\n0  NaN  0.009  Abc\n1    a  3.543   23",
        ),
    ],
)
def test_to_string_max_rows_zero(data, expected):
    """``max_rows=0`` renders the complete frame.

    The expected strings contain alignment padding, which is part of the
    rendered output and must be kept verbatim.
    """
    # GH35394
    result = DataFrame(data=data).to_string(max_rows=0)
    assert result == expected
def test_to_string_string_dtype():
    """``df.dtypes.to_string()`` spells out string and pyarrow-backed dtype names."""
    # GH#50099
    pytest.importorskip("pyarrow")
    df = DataFrame({"x": ["foo", "bar", "baz"], "y": ["a", "b", "c"], "z": [1, 2, 3]})
    df = df.astype(
        {"x": "string[pyarrow]", "y": "string[python]", "z": "int64[pyarrow]"}
    )
    result = df.dtypes.to_string()
    # Dtype names are right-aligned to the longest one; padding is significant.
    expected = dedent(
        """\
        x    string[pyarrow]
        y     string[python]
        z     int64[pyarrow]"""
    )
    assert result == expected

View File

@@ -0,0 +1,348 @@
"""
self-contained to write legacy storage pickle files
To use this script, create an environment for the pandas version you want
to generate pickles for, say it's 0.20.3, with your pandas clone
in ~/pandas
. activate pandas_0.20.3
cd ~/pandas/pandas
$ python -m tests.io.generate_legacy_storage_files \
tests/io/data/legacy_pickle/0.20.3/ pickle
This script generates a storage file for the current arch, system,
and python version
pandas version: 0.20.3
output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/
storage format: pickle
created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle
The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of pandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with main). These are then compared.
If we have cases where we changed the signature (e.g. we renamed
offset -> freq in Timestamp), then we have to conditionally execute
code in generate_legacy_storage_files.py to make it
run under the older AND the newer version.
"""
from datetime import timedelta
import os
import pickle
import platform as pl
import sys
# Remove script directory from path, otherwise Python will try to
# import the JSON test directory as the json module
sys.path.pop(0)
import numpy as np
import pandas
from pandas import (
Categorical,
DataFrame,
Index,
MultiIndex,
NaT,
Period,
RangeIndex,
Series,
Timestamp,
bdate_range,
date_range,
interval_range,
period_range,
timedelta_range,
)
from pandas.arrays import SparseArray
from pandas.tseries.offsets import (
FY5253,
BusinessDay,
BusinessHour,
CustomBusinessDay,
DateOffset,
Day,
Easter,
Hour,
LastWeekOfMonth,
Minute,
MonthBegin,
MonthEnd,
QuarterBegin,
QuarterEnd,
SemiMonthBegin,
SemiMonthEnd,
Week,
WeekOfMonth,
YearBegin,
YearEnd,
)
def _create_sp_series():
nan = np.nan
# nan-based
arr = np.arange(15, dtype=np.float64)
arr[7:12] = nan
arr[-1:] = nan
bseries = Series(SparseArray(arr, kind="block"))
bseries.name = "bseries"
return bseries
def _create_sp_tsseries():
nan = np.nan
# nan-based
arr = np.arange(15, dtype=np.float64)
arr[7:12] = nan
arr[-1:] = nan
date_index = bdate_range("1/1/2011", periods=len(arr))
bseries = Series(SparseArray(arr, kind="block"), index=date_index)
bseries.name = "btsseries"
return bseries
def _create_sp_frame():
nan = np.nan
data = {
"A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
"B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
"C": np.arange(10).astype(np.int64),
"D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
}
dates = bdate_range("1/1/2011", periods=10)
return DataFrame(data, index=dates).apply(SparseArray)
def create_data():
    """Create the pickle data.

    Returns a dict of named object collections (series, frames, indexes,
    scalars, multi-indexes, sparse containers, categoricals, timestamps and
    date offsets). Legacy spellings such as ``freq="30T"`` are intentional:
    the module docstring explains that this script is meant to run under
    older pandas versions as well.
    """
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")}

    index = {
        "int": Index(np.arange(10)),
        "date": date_range("20130101", periods=10),
        "period": period_range("2013-01-01", freq="M", periods=10),
        "float": Index(np.arange(10, dtype=np.float64)),
        "uint": Index(np.arange(10, dtype=np.uint64)),
        "timedelta": timedelta_range("00:00:00", freq="30T", periods=10),
    }

    index["range"] = RangeIndex(10)

    index["interval"] = interval_range(0, periods=10)

    mi = {
        "reg2": MultiIndex.from_tuples(
            tuple(
                zip(
                    *[
                        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                        ["one", "two", "one", "two", "one", "two", "one", "two"],
                    ]
                )
            ),
            names=["first", "second"],
        )
    }

    series = {
        "float": Series(data["A"]),
        "int": Series(data["B"]),
        "mixed": Series(data["E"]),
        "ts": Series(
            np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
        ),
        "mi": Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(
                tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
            ),
        ),
        "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
        "cat": Series(Categorical(["foo", "bar", "baz"])),
        "dt": Series(date_range("20130101", periods=5)),
        "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
        "period": Series([Period("2000Q1")] * 5),
    }

    # Frame with a repeated column label ("A" appears twice).
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    frame = {
        "float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
        "int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
        "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        "mi": DataFrame(
            {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
            index=MultiIndex.from_tuples(
                tuple(
                    zip(
                        *[
                            ["bar", "bar", "baz", "baz", "baz"],
                            ["one", "two", "one", "two", "three"],
                        ]
                    )
                ),
                names=["first", "second"],
            ),
        ),
        "dup": DataFrame(
            np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
        ),
        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
        "cat_and_float": DataFrame(
            {
                "A": Categorical(["foo", "bar", "baz"]),
                "B": np.arange(3).astype(np.int64),
            }
        ),
        "mixed_dup": mixed_dup_df,
        "dt_mixed_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        "dt_mixed2_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    }

    cat = {
        "int8": Categorical(list("abcdefg")),
        "int16": Categorical(np.arange(1000)),
        "int32": Categorical(np.arange(10000)),
    }

    timestamp = {
        "normal": Timestamp("2011-01-01"),
        "nat": NaT,
        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
    }

    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return {
        "series": series,
        "frame": frame,
        "index": index,
        "scalars": scalars,
        "mi": mi,
        "sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()},
        "sp_frame": {"float": _create_sp_frame()},
        "cat": cat,
        "timestamp": timestamp,
        "offsets": off,
    }
def create_pickle_data():
    """Return the object collection to be pickled (currently ``create_data()`` as is)."""
    data = create_data()

    return data
def platform_name():
    """Return ``<pandas version>_<machine>_<system, lower-cased>_<python version>``.

    The result is used as the base name of the generated storage file.
    """
    return "_".join(
        [
            str(pandas.__version__),
            str(pl.machine()),
            str(pl.system().lower()),
            str(pl.python_version()),
        ]
    )
def write_legacy_pickles(output_dir):
    """Pickle the generated data into ``output_dir`` and report progress on stdout.

    Parameters
    ----------
    output_dir : str
        Directory that receives ``<platform_name()>.pickle``.
    """
    version = pandas.__version__

    print(
        "This script generates a storage file for the current arch, system, "
        "and python version"
    )
    print(f" pandas version: {version}")
    print(f" output dir : {output_dir}")
    print(" storage format: pickle")

    pth = f"{platform_name()}.pickle"

    with open(os.path.join(output_dir, pth), "wb") as fh:
        pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)

    print(f"created pickle file: {pth}")
def write_legacy_file():
    """Command-line entry point: parse ``sys.argv`` and write the storage file.

    Expects ``<output_dir> <storage_type>`` as arguments. Exits through
    ``sys.exit`` with a usage message when the argument count is wrong or
    when ``storage_type`` is not ``"pickle"``.
    """
    # force our cwd to be the first searched
    sys.path.insert(0, "")

    if not 3 <= len(sys.argv) <= 4:
        sys.exit(
            "Specify output directory and storage type: generate_legacy_"
            "storage_files.py <output_dir> <storage_type> "
        )

    output_dir = str(sys.argv[1])
    storage_type = str(sys.argv[2])

    if not os.path.exists(output_dir):
        # makedirs also creates missing parent directories (the usage example
        # in the module docstring passes a nested path) and tolerates the
        # directory appearing between the check and the call.
        os.makedirs(output_dir, exist_ok=True)

    if storage_type == "pickle":
        write_legacy_pickles(output_dir=output_dir)
    else:
        sys.exit("storage_type must be one of {'pickle'}")
# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    write_legacy_file()

View File

@@ -0,0 +1,16 @@
import pytest
@pytest.fixture(params=["split", "records", "index", "columns", "values"])
def orient(request):
    """
    Fixture for orients excluding the table format.
    """
    return request.param
@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    """
    Fixture for the JSON engine name; the pyarrow case is skipped
    when ``pyarrow.json`` cannot be imported.
    """
    if request.param == "pyarrow":
        pytest.importorskip("pyarrow.json")
    return request.param

View File

@@ -0,0 +1,126 @@
from io import (
BytesIO,
StringIO,
)
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
def test_compression_roundtrip(compression):
    """A frame written with ``to_json(compression=...)`` reads back unchanged."""
    df = pd.DataFrame(
        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
        index=["A", "B"],
        columns=["X", "Y", "Z"],
    )

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        tm.assert_frame_equal(df, pd.read_json(path, compression=compression))

        # explicitly ensure file was compressed.
        with tm.decompress_file(path, compression) as fh:
            result = fh.read().decode("utf8")
            data = StringIO(result)
        tm.assert_frame_equal(df, pd.read_json(data))
def test_read_zipped_json(datapath):
    """Reading the zipped data file gives the same frame as the plain JSON file."""
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression="zip")

    tm.assert_frame_equal(uncompressed_df, compressed_df)
@td.skip_if_not_us_locale
@pytest.mark.single_cpu
def test_with_s3_url(compression, s3_public_bucket, s3so):
    """A compressed JSON object uploaded to the bucket reads back via an ``s3://`` URL."""
    # Bucket created in tests/io/conftest.py
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, "rb") as f:
            s3_public_bucket.put_object(Key="test-1", Body=f)

    roundtripped_df = pd.read_json(
        f"s3://{s3_public_bucket.name}/test-1",
        compression=compression,
        storage_options=s3so,
    )
    tm.assert_frame_equal(df, roundtripped_df)
def test_lines_with_compression(compression):
    """Line-delimited JSON round-trips through a compressed file."""
    with tm.ensure_clean() as path:
        df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
        df.to_json(path, orient="records", lines=True, compression=compression)
        roundtripped_df = pd.read_json(path, lines=True, compression=compression)
        tm.assert_frame_equal(df, roundtripped_df)
def test_chunksize_with_compression(compression):
    """Chunked reading (``chunksize=1``) of compressed line-delimited JSON round-trips."""
    with tm.ensure_clean() as path:
        df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
        df.to_json(path, orient="records", lines=True, compression=compression)

        # The reader is used as a context manager and its chunks are
        # concatenated back into a single frame.
        with pd.read_json(
            path, lines=True, chunksize=1, compression=compression
        ) as res:
            roundtripped_df = pd.concat(res)
        tm.assert_frame_equal(df, roundtripped_df)
def test_write_unsupported_compression_type():
    """``to_json`` raises ``ValueError`` for an unknown compression name."""
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            df.to_json(path, compression="unsupported")
def test_read_unsupported_compression_type():
    """``read_json`` raises ``ValueError`` for an unknown compression name."""
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            pd.read_json(path, compression="unsupported")
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(
    compression_only, read_infer, to_infer, compression_to_extension
):
    """Round-trip with compression given explicitly or as ``"infer"`` on either side."""
    # see gh-15008
    compression = compression_only

    # We'll complete file extension subsequently.
    filename = "test."
    filename += compression_to_extension[compression]

    df = pd.DataFrame({"A": [1]})

    to_compression = "infer" if to_infer else compression
    read_compression = "infer" if read_infer else compression

    with tm.ensure_clean(filename) as path:
        df.to_json(path, compression=to_compression)
        result = pd.read_json(path, compression=read_compression)
        tm.assert_frame_equal(result, df)
def test_to_json_compression_mode(compression):
    """Writing compressed JSON to a user-provided binary buffer does not raise."""
    # GH 39985 (read_json does not support user-provided binary files)
    expected = pd.DataFrame({"A": [1]})

    with BytesIO() as buffer:
        expected.to_json(buffer, compression=compression)
        # df = pd.read_json(buffer, compression=compression)
        # tm.assert_frame_equal(expected, df)

View File

@@ -0,0 +1,21 @@
"""
Tests for the deprecated keyword arguments for `read_json`.
"""
from io import StringIO
import pandas as pd
import pandas._testing as tm
from pandas.io.json import read_json
def test_good_kwargs():
    """``read_json`` with an explicit ``orient`` round-trips without any warning."""
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])

    # assert_produces_warning(None) fails if anything inside emits a warning.
    with tm.assert_produces_warning(None):
        data1 = StringIO(df.to_json(orient="split"))
        tm.assert_frame_equal(df, read_json(data1, orient="split"))
        data2 = StringIO(df.to_json(orient="columns"))
        tm.assert_frame_equal(df, read_json(data2, orient="columns"))
        data3 = StringIO(df.to_json(orient="index"))
        tm.assert_frame_equal(df, read_json(data3, orient="index"))

View File

@@ -0,0 +1,847 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
from io import StringIO
import json
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
PeriodDtype,
)
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.json._table_schema import (
as_json_table_type,
build_table_schema,
convert_json_field_to_pandas_type,
convert_pandas_type_to_json_field,
set_default_names,
)
@pytest.fixture
def df_schema():
    """Four-row frame with integer, string, datetime and timedelta columns.

    The index is ``range(4)`` named ``"idx"``.
    """
    return DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "c"],
            "C": pd.date_range("2016-01-01", freq="d", periods=4),
            "D": pd.timedelta_range("1H", periods=4, freq="T"),
        },
        index=pd.Index(range(4), name="idx"),
    )
@pytest.fixture
def df_table():
    """Four-row frame covering more column kinds than the schema fixture.

    Columns: integer, string, datetime, timedelta, unordered and ordered
    categoricals, float, and tz-aware datetime. The index is ``range(4)``
    named ``"idx"``.
    """
    return DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "c"],
            "C": pd.date_range("2016-01-01", freq="d", periods=4),
            "D": pd.timedelta_range("1H", periods=4, freq="T"),
            "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
            "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
            "G": [1.0, 2.0, 3, 4.0],
            "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
        },
        index=pd.Index(range(4), name="idx"),
    )
class TestBuildSchema:
    """Tests for ``build_table_schema`` on frames and series."""

    def test_build_table_schema(self, df_schema):
        """Schema of a simple frame; ``pandas_version`` is present by default."""
        result = build_table_schema(df_schema, version=False)
        expected = {
            "fields": [
                {"name": "idx", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["idx"],
        }
        assert result == expected
        result = build_table_schema(df_schema)
        assert "pandas_version" in result

    def test_series(self):
        """A named series contributes its own name as the value field."""
        s = pd.Series([1, 2, 3], name="foo")
        result = build_table_schema(s, version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "foo", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected
        result = build_table_schema(s)
        assert "pandas_version" in result

    def test_series_unnamed(self):
        """An unnamed series uses ``"values"`` as the field name."""
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "values", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected

    def test_multiindex(self, df_schema):
        """Unnamed MultiIndex levels get ``level_N`` names; given names are kept."""
        df = df_schema
        idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
        df.index = idx

        result = build_table_schema(df, version=False)
        expected = {
            "fields": [
                {"name": "level_0", "type": "string"},
                {"name": "level_1", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["level_0", "level_1"],
        }
        assert result == expected

        # Name only the first level; the second keeps its default name.
        df.index.names = ["idx0", None]
        expected["fields"][0]["name"] = "idx0"
        expected["primaryKey"] = ["idx0", "level_1"]
        result = build_table_schema(df, version=False)
        assert result == expected
class TestTableSchemaType:
    """Tests for ``as_json_table_type``: mapping dtypes to Table Schema type names."""

    @pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer"

    @pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1.0, 2.0, 3.0]
        assert (
            as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number"
        )

    @pytest.mark.parametrize("bool_type", [bool, np.bool_])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert (
            as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean"
        )

    @pytest.mark.parametrize(
        "date_data",
        [
            pd.to_datetime(["2016"]),
            pd.to_datetime(["2016"], utc=True),
            pd.Series(pd.to_datetime(["2016"])),
            pd.Series(pd.to_datetime(["2016"], utc=True)),
            pd.period_range("2016", freq="A", periods=3),
        ],
    )
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data.dtype) == "datetime"

    @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data.dtype) == "string"

    @pytest.mark.parametrize(
        "cat_data",
        [
            pd.Categorical(["a"]),
            pd.Categorical([1]),
            pd.Series(pd.Categorical([1])),
            pd.CategoricalIndex([1]),
            pd.Categorical([1]),
        ],
    )
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data.dtype) == "any"

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == "integer"

    @pytest.mark.parametrize("float_dtype", [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == "number"

    @pytest.mark.parametrize("bool_dtype", [bool, np.bool_])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == "boolean"

    @pytest.mark.parametrize(
        "date_dtype",
        [
            np.dtype("<M8[ns]"),
            PeriodDtype("D"),
            DatetimeTZDtype("ns", "US/Central"),
        ],
    )
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datetime.date? datetime.time?
        assert as_json_table_type(date_dtype) == "datetime"

    @pytest.mark.parametrize("td_dtype", [np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == "duration"

    @pytest.mark.parametrize("str_dtype", [object])  # TODO(GH#14904) flesh out dtypes?
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == "string"

    def test_as_json_table_type_categorical_dtypes(self):
        assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any"
        assert as_json_table_type(CategoricalDtype()) == "any"
class TestTableOrient:
    """Tests for ``to_json(orient="table")`` output and the schema helper functions."""

    def test_build_series(self):
        """Table-orient JSON of a named series with a named index."""
        s = pd.Series([1, 2], name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        # Dropped so the comparison below does not depend on its value.
        result["schema"].pop("pandas_version")

        fields = [{"name": "id", "type": "integer"}, {"name": "a", "type": "integer"}]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                (
                    "data",
                    [
                        OrderedDict([("id", 0), ("a", 1)]),
                        OrderedDict([("id", 1), ("a", 2)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_read_json_from_to_json_results(self):
        """Default-orient JSON with non-ASCII text round-trips two ways."""
        # GH32383
        df = DataFrame(
            {
                "_id": {"row_0": 0},
                "category": {"row_0": "Goods"},
                "recommender_id": {"row_0": 3},
                "recommender_name_jp": {"row_0": "浦田"},
                "recommender_name_en": {"row_0": "Urata"},
                "name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"},
                "name_en": {"row_0": "Hakata Dolls Matsuo"},
            }
        )

        result1 = pd.read_json(StringIO(df.to_json()))
        result2 = DataFrame.from_dict(json.loads(df.to_json()))
        tm.assert_frame_equal(result1, df)
        tm.assert_frame_equal(result2, df)

    def test_to_json(self, df_table):
        """Full table-orient output (schema and data) for the mixed-dtype frame."""
        df = df_table
        df.index.name = "idx"
        result = df.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {"name": "A", "type": "integer"},
            {"name": "B", "type": "string"},
            {"name": "C", "type": "datetime"},
            {"name": "D", "type": "duration"},
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "E",
                "ordered": False,
                "type": "any",
            },
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "F",
                "ordered": True,
                "type": "any",
            },
            {"name": "G", "type": "number"},
            {"name": "H", "type": "datetime", "tz": "US/Central"},
        ]

        schema = {"fields": fields, "primaryKey": ["idx"]}
        data = [
            OrderedDict(
                [
                    ("idx", 0),
                    ("A", 1),
                    ("B", "a"),
                    ("C", "2016-01-01T00:00:00.000"),
                    ("D", "P0DT1H0M0S"),
                    ("E", "a"),
                    ("F", "a"),
                    ("G", 1.0),
                    ("H", "2016-01-01T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 1),
                    ("A", 2),
                    ("B", "b"),
                    ("C", "2016-01-02T00:00:00.000"),
                    ("D", "P0DT1H1M0S"),
                    ("E", "b"),
                    ("F", "b"),
                    ("G", 2.0),
                    ("H", "2016-01-02T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 2),
                    ("A", 3),
                    ("B", "c"),
                    ("C", "2016-01-03T00:00:00.000"),
                    ("D", "P0DT1H2M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 3.0),
                    ("H", "2016-01-03T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 3),
                    ("A", 4),
                    ("B", "c"),
                    ("C", "2016-01-04T00:00:00.000"),
                    ("D", "P0DT1H3M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 4.0),
                    ("H", "2016-01-04T06:00:00.000Z"),
                ]
            ),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_float_index(self):
        """A float index is reported as a ``number`` field."""
        data = pd.Series(1, index=[1.0, 2.0])
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {"name": "index", "type": "number"},
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", 1.0), ("values", 1)]),
                        OrderedDict([("index", 2.0), ("values", 1)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_to_json_period_index(self):
        """A period index is reported as ``datetime`` with its ``freq``."""
        idx = pd.period_range("2016", freq="Q-JAN", periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"freq": "Q-JAN", "name": "index", "type": "datetime"},
            {"name": "values", "type": "integer"},
        ]

        schema = {"fields": fields, "primaryKey": ["index"]}
        data = [
            OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
            OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_categorical_index(self):
        """A categorical index is reported as ``any`` with enum constraints."""
        data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {
                                "name": "index",
                                "type": "any",
                                "constraints": {"enum": ["a", "b"]},
                                "ordered": False,
                            },
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", "a"), ("values", 1)]),
                        OrderedDict([("index", "b"), ("values", 1)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_date_format_raises(self, df_table):
        """``date_format="epoch"`` is rejected for table orient."""
        msg = (
            "Trying to write with `orient='table'` and `date_format='epoch'`. Table "
            "Schema requires dates to be formatted with `date_format='iso'`"
        )
        with pytest.raises(ValueError, match=msg):
            df_table.to_json(orient="table", date_format="epoch")

        # others work
        df_table.to_json(orient="table", date_format="iso")
        df_table.to_json(orient="table")

    def test_convert_pandas_type_to_json_field_int(self, index_or_series):
        kind = index_or_series
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "integer"}
        assert result == expected

    def test_convert_pandas_type_to_json_field_float(self, index_or_series):
        kind = index_or_series
        data = [1.0, 2.0, 3.0]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "number"}
        assert result == expected

    @pytest.mark.parametrize(
        "dt_args,extra_exp", [({}, {}), ({"utc": True}, {"tz": "UTC"})]
    )
    @pytest.mark.parametrize("wrapper", [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(
        self, dt_args, extra_exp, wrapper
    ):
        """Datetime data maps to ``datetime``; ``utc=True`` adds a ``tz`` entry."""
        data = [1.0, 2.0, 3.0]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name="values")
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": "datetime"}
        expected.update(extra_exp)
        assert result == expected

    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range("2016", freq="A-DEC", periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": "datetime", "freq": "A-DEC"}
        assert result == expected

    @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize("ordered", [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
        """Categorical data maps to ``any`` with enum constraints and ``ordered``."""
        data = ["a", "b", "c"]
        if kind is pd.Categorical:
            # A bare Categorical is wrapped in a Series to give it a name.
            arr = pd.Series(kind(data, ordered=ordered), name="cats")
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name="cats")

        result = convert_pandas_type_to_json_field(arr)
        expected = {
            "name": "cats",
            "type": "any",
            "constraints": {"enum": data},
            "ordered": ordered,
        }
        assert result == expected

    @pytest.mark.parametrize(
        "inp,exp",
        [
            ({"type": "integer"}, "int64"),
            ({"type": "number"}, "float64"),
            ({"type": "boolean"}, "bool"),
            ({"type": "duration"}, "timedelta64"),
            ({"type": "datetime"}, "datetime64[ns]"),
            ({"type": "datetime", "tz": "US/Hawaii"}, "datetime64[ns, US/Hawaii]"),
            ({"type": "any"}, "object"),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": False,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=False),
            ),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": True,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=True),
            ),
            ({"type": "string"}, "object"),
        ],
    )
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {"name": "foo"}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {"type": inp}
        with pytest.raises(
            ValueError, match=f"Unsupported or invalid field type: {inp}"
        ):
            convert_json_field_to_pandas_type(field)

    def test_categorical(self):
        """Table-orient output for a categorical series with a named index."""
        s = pd.Series(pd.Categorical(["a", "b", "a"]))
        s.index.name = "idx"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {
                "constraints": {"enum": ["a", "b"]},
                "name": "values",
                "ordered": False,
                "type": "any",
            },
        ]

        expected = OrderedDict(
            [
                ("schema", {"fields": fields, "primaryKey": ["idx"]}),
                (
                    "data",
                    [
                        OrderedDict([("idx", 0), ("values", "a")]),
                        OrderedDict([("idx", 1), ("values", "b")]),
                        OrderedDict([("idx", 2), ("values", "a")]),
                    ],
                ),
            ]
        )

        assert result == expected

    @pytest.mark.parametrize(
        "idx,nm,prop",
        [
            (pd.Index([1]), "index", "name"),
            (pd.Index([1], name="myname"), "myname", "name"),
            (
                pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
                ["level_0", "level_1"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", "n2"]
                ),
                ["n1", "n2"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", None]
                ),
                ["n1", "level_1"],
                "names",
            ),
        ],
    )
    def test_set_names_unset(self, idx, nm, prop):
        """``set_default_names`` fills in missing index names and keeps given ones."""
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index([], name="index"),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")),
        ],
    )
    def test_warns_non_roundtrippable_names(self, idx):
        """``set_default_names`` warns when the index name is ``"index"``."""
        # GH 19130
        df = DataFrame(index=idx)
        df.index.name = "index"
        with tm.assert_produces_warning():
            set_default_names(df)

    def test_timestamp_in_columns(self):
        """Timestamp and Timedelta column labels become ISO strings in field names."""
        df = DataFrame(
            [[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]
        )
        result = df.to_json(orient="table")
        js = json.loads(result)
        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
        assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"

    @pytest.mark.parametrize(
        "case",
        [
            pd.Series([1], index=pd.Index([1], name="a"), name="a"),
            DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
            DataFrame(
                {"A": [1]},
                index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]),
            ),
        ],
    )
    def test_overlapping_names(self, case):
        """Index and column names that collide are rejected with ``ValueError``."""
        with pytest.raises(ValueError, match="Overlapping"):
            case.to_json(orient="table")

    def test_mi_falsey_name(self):
        """Integer column labels, including the falsy ``0``, are kept as field names."""
        # GH 16203
        df = DataFrame(
            np.random.default_rng(2).standard_normal((4, 4)),
            index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]),
        )
        result = [x["name"] for x in build_table_schema(df)["fields"]]
        assert result == ["level_0", "level_1", 0, 1, 2, 3]
class TestTableOrientReader:
    """Round-trip tests for ``pd.read_json(..., orient="table")``."""

    # Index names tried by the round-trip tests; the "index" case is marked xfail.
    _INDEX_NAMES = [
        None,
        "idx",
        pytest.param("index", marks=pytest.mark.xfail),
        "level_0",
    ]

    # Single-column payloads shared by the integer-index and period-index
    # round-trip tests below.
    _ROUNDTRIP_VALS = [
        {"ints": [1, 2, 3, 4]},
        {"objects": ["a", "b", "c", "d"]},
        {"objects": ["1", "2", "3", "4"]},
        {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
        {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
        {
            "ordered_cats": pd.Series(
                pd.Categorical(["a", "b", "c", "c"], ordered=True)
            )
        },
        {"floats": [1.0, 2.0, 3.0, 4.0]},
        {"floats": [1.1, 2.2, 3.3, 4.4]},
        {"bools": [True, False, False, True]},
        {
            "timezones": pd.date_range(
                "2016-01-01", freq="d", periods=4, tz="US/Central"
            )  # added in GH 35973
        },
    ]

    @pytest.mark.parametrize("index_nm", _INDEX_NAMES)
    @pytest.mark.parametrize("vals", _ROUNDTRIP_VALS)
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        """A frame with a named integer index equals itself after a round trip."""
        row_index = pd.Index(range(4), name=index_nm)
        df = DataFrame(vals, index=row_index)
        serialized = df.to_json(orient="table")
        roundtripped = pd.read_json(serialized, orient="table")
        tm.assert_frame_equal(df, roundtripped)

    @pytest.mark.parametrize("index_nm", [None, "idx", "index"])
    @pytest.mark.parametrize(
        "vals",
        [{"timedeltas": pd.timedelta_range("1H", periods=4, freq="T")}],
    )
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        """Reading back a timedelta column raises ``NotImplementedError``."""
        row_index = pd.Index(range(4), name=index_nm)
        df = DataFrame(vals, index=row_index)
        serialized = df.to_json(orient="table")
        with pytest.raises(NotImplementedError, match="can not yet read "):
            pd.read_json(serialized, orient="table")

    @pytest.mark.parametrize("index_nm", _INDEX_NAMES)
    @pytest.mark.parametrize("vals", _ROUNDTRIP_VALS)
    def test_read_json_table_period_orient(self, index_nm, vals, recwarn):
        """Same round trip as above, but with an index built from quarterly Periods."""
        quarters = (pd.Period(f"2022Q{q}") for q in range(1, 5))
        df = DataFrame(vals, index=pd.Index(quarters, name=index_nm))
        serialized = df.to_json(orient="table")
        roundtripped = pd.read_json(serialized, orient="table")
        tm.assert_frame_equal(df, roundtripped)

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index(range(4)),
            pd.date_range(
                "2020-08-30",
                freq="d",
                periods=4,
            )._with_freq(None),
            pd.date_range(
                "2020-08-30", freq="d", periods=4, tz="US/Central"
            )._with_freq(None),
            pd.MultiIndex.from_product(
                [
                    pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"),
                    ["x", "y"],
                ],
            ),
        ],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"dates": pd.date_range("2020-08-30", freq="d", periods=4)},
            {
                "timezones": pd.date_range(
                    "2020-08-30", freq="d", periods=4, tz="Europe/London"
                )
            },
        ],
    )
    def test_read_json_table_timezones_orient(self, idx, vals, recwarn):
        """Naive and tz-aware indexes and columns round-trip unchanged."""
        # GH 35973
        df = DataFrame(vals, index=idx)
        serialized = df.to_json(orient="table")
        roundtripped = pd.read_json(serialized, orient="table")
        tm.assert_frame_equal(df, roundtripped)

    def test_comprehensive(self):
        """A frame mixing several column dtypes round-trips through a buffer."""
        columns = {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "c"],
            "C": pd.date_range("2016-01-01", freq="d", periods=4),
            # No timedelta column ("D"): reading timedeltas back raises, see
            # test_read_json_table_orient_raises in this class.
            "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
            "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
            "G": [1.1, 2.2, 3.3, 4.4],
            "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
            "I": [True, False, False, True],
        }
        df = DataFrame(columns, index=pd.Index(range(4), name="idx"))
        buffer = StringIO(df.to_json(orient="table"))
        roundtripped = pd.read_json(buffer, orient="table")
        tm.assert_frame_equal(df, roundtripped)

    @pytest.mark.parametrize(
        "index_names",
        [[None, None], ["foo", "bar"], ["foo", None], [None, "foo"], ["index", "foo"]],
    )
    def test_multiindex(self, index_names):
        """Two-level indexes round-trip for several level-name combinations."""
        # GH 18912
        rows = [["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]]
        df = DataFrame(
            rows,
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"],
        )
        df.index.names = index_names
        buffer = StringIO(df.to_json(orient="table"))
        roundtripped = pd.read_json(buffer, orient="table")
        tm.assert_frame_equal(df, roundtripped)

    def test_empty_frame_roundtrip(self):
        """A frame with columns but no rows round-trips."""
        # GH 21287
        df = DataFrame(columns=["a", "b", "c"])
        # Snapshot taken before serializing, then compared with the read result.
        expected = df.copy()
        buffer = StringIO(df.to_json(orient="table"))
        roundtripped = pd.read_json(buffer, orient="table")
        tm.assert_frame_equal(expected, roundtripped)

    def test_read_json_orient_table_old_schema_version(self):
        """A payload declaring ``pandas_version`` 0.20.0 is still readable."""
        df_json = """
{
"schema":{
"fields":[
{"name":"index","type":"integer"},
{"name":"a","type":"string"}
],
"primaryKey":["index"],
"pandas_version":"0.20.0"
},
"data":[
{"index":0,"a":1},
{"index":1,"a":2.0},
{"index":2,"a":"s"}
]
}
"""
        parsed = pd.read_json(StringIO(df_json), orient="table")
        expected = DataFrame({"a": [1, 2.0, "s"]})
        tm.assert_frame_equal(expected, parsed)

View File

@@ -0,0 +1,317 @@
"""Tests for ExtensionDtype Table Schema integration."""
from collections import OrderedDict
import datetime as dt
import decimal
from io import StringIO
import json
import pytest
from pandas import (
NA,
DataFrame,
Index,
array,
read_json,
)
import pandas._testing as tm
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.series import Series
from pandas.tests.extension.date import (
DateArray,
DateDtype,
)
from pandas.tests.extension.decimal.array import (
DecimalArray,
DecimalDtype,
)
from pandas.io.json._table_schema import (
as_json_table_type,
build_table_schema,
)
class TestBuildSchema:
    """Schema generation for frames holding extension-array columns."""

    def test_build_table_schema(self):
        """Each extension column is described with a ``type`` and an ``extDtype``."""
        frame = DataFrame(
            {
                "A": DateArray([dt.date(2021, 10, 10)]),
                "B": DecimalArray([decimal.Decimal(10)]),
                "C": array(["pandas"], dtype="string"),
                "D": array([10], dtype="Int64"),
            }
        )
        unversioned = build_table_schema(frame, version=False)
        expected_fields = [
            {"name": "index", "type": "integer"},
            {"name": "A", "type": "any", "extDtype": "DateDtype"},
            {"name": "B", "type": "number", "extDtype": "decimal"},
            {"name": "C", "type": "any", "extDtype": "string"},
            {"name": "D", "type": "integer", "extDtype": "Int64"},
        ]
        assert unversioned == {"fields": expected_fields, "primaryKey": ["index"]}

        # Without ``version=False`` the schema also carries a version key.
        versioned = build_table_schema(frame)
        assert "pandas_version" in versioned
class TestTableSchemaType:
    """Mapping from extension dtypes to Table Schema type names."""

    @pytest.mark.parametrize(
        "date_data",
        [
            DateArray([dt.date(2021, 10, 10)]),
            DateArray(dt.date(2021, 10, 10)),
            Series(DateArray(dt.date(2021, 10, 10))),
        ],
    )
    def test_as_json_table_type_ext_date_array_dtype(self, date_data):
        """The dtype of date-array-backed data maps to "any"."""
        json_type = as_json_table_type(date_data.dtype)
        assert json_type == "any"

    def test_as_json_table_type_ext_date_dtype(self):
        """A bare ``DateDtype`` instance maps to "any"."""
        json_type = as_json_table_type(DateDtype())
        assert json_type == "any"

    @pytest.mark.parametrize(
        "decimal_data",
        [
            DecimalArray([decimal.Decimal(10)]),
            Series(DecimalArray([decimal.Decimal(10)])),
        ],
    )
    def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data):
        """The dtype of decimal-array-backed data maps to "number"."""
        json_type = as_json_table_type(decimal_data.dtype)
        assert json_type == "number"

    def test_as_json_table_type_ext_decimal_dtype(self):
        """A bare ``DecimalDtype`` instance maps to "number"."""
        json_type = as_json_table_type(DecimalDtype())
        assert json_type == "number"

    @pytest.mark.parametrize(
        "string_data",
        [
            array(["pandas"], dtype="string"),
            Series(array(["pandas"], dtype="string")),
        ],
    )
    def test_as_json_table_type_ext_string_array_dtype(self, string_data):
        """The dtype of "string"-typed data maps to "any"."""
        json_type = as_json_table_type(string_data.dtype)
        assert json_type == "any"

    def test_as_json_table_type_ext_string_dtype(self):
        """A bare ``StringDtype`` instance maps to "any"."""
        json_type = as_json_table_type(StringDtype())
        assert json_type == "any"

    @pytest.mark.parametrize(
        "integer_data",
        [
            array([10], dtype="Int64"),
            Series(array([10], dtype="Int64")),
        ],
    )
    def test_as_json_table_type_ext_integer_array_dtype(self, integer_data):
        """The dtype of "Int64"-typed data maps to "integer"."""
        json_type = as_json_table_type(integer_data.dtype)
        assert json_type == "integer"

    def test_as_json_table_type_ext_integer_dtype(self):
        """A bare ``Int64Dtype`` instance maps to "integer"."""
        json_type = as_json_table_type(Int64Dtype())
        assert json_type == "integer"
class TestTableOrient:
    """``to_json``/``read_json`` with orient="table" for extension dtypes."""

    @staticmethod
    def _table_json_without_version(obj):
        """Serialize ``obj`` as table JSON and return the parsed payload.

        Asserts that the schema carries ``pandas_version`` and removes that
        key so the rest can be compared against a literal.
        """
        raw = obj.to_json(orient="table", date_format="iso")
        parsed = json.loads(raw, object_pairs_hook=OrderedDict)
        assert "pandas_version" in parsed["schema"]
        parsed["schema"].pop("pandas_version")
        return parsed

    @pytest.fixture
    def da(self):
        """One-element ``DateArray``."""
        return DateArray([dt.date(2021, 10, 10)])

    @pytest.fixture
    def dc(self):
        """One-element ``DecimalArray``."""
        return DecimalArray([decimal.Decimal(10)])

    @pytest.fixture
    def sa(self):
        """One-element array with dtype "string"."""
        return array(["pandas"], dtype="string")

    @pytest.fixture
    def ia(self):
        """One-element array with dtype "Int64"."""
        return array([10], dtype="Int64")

    @pytest.fixture
    def df(self, da, dc, sa, ia):
        """Frame with one column per extension array fixture."""
        columns = {"A": da, "B": dc, "C": sa, "D": ia}
        return DataFrame(columns)

    def test_build_date_series(self, da):
        """A date-array Series is written with type "any" and ISO values."""
        ser = Series(da, name="a")
        ser.index.name = "id"
        result = self._table_json_without_version(ser)

        schema = {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "a", "type": "any", "extDtype": "DateDtype"},
            ],
            "primaryKey": ["id"],
        }
        row = OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])
        expected = OrderedDict([("schema", schema), ("data", [row])])
        assert result == expected

    def test_build_decimal_series(self, dc):
        """A decimal-array Series is written with type "number"."""
        ser = Series(dc, name="a")
        ser.index.name = "id"
        result = self._table_json_without_version(ser)

        schema = {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "a", "type": "number", "extDtype": "decimal"},
            ],
            "primaryKey": ["id"],
        }
        row = OrderedDict([("id", 0), ("a", 10.0)])
        expected = OrderedDict([("schema", schema), ("data", [row])])
        assert result == expected

    def test_build_string_series(self, sa):
        """A "string"-dtype Series is written with type "any"."""
        ser = Series(sa, name="a")
        ser.index.name = "id"
        result = self._table_json_without_version(ser)

        schema = {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "a", "type": "any", "extDtype": "string"},
            ],
            "primaryKey": ["id"],
        }
        row = OrderedDict([("id", 0), ("a", "pandas")])
        expected = OrderedDict([("schema", schema), ("data", [row])])
        assert result == expected

    def test_build_int64_series(self, ia):
        """An "Int64"-dtype Series is written with type "integer"."""
        ser = Series(ia, name="a")
        ser.index.name = "id"
        result = self._table_json_without_version(ser)

        schema = {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "a", "type": "integer", "extDtype": "Int64"},
            ],
            "primaryKey": ["id"],
        }
        row = OrderedDict([("id", 0), ("a", 10)])
        expected = OrderedDict([("schema", schema), ("data", [row])])
        assert result == expected

    def test_to_json(self, df):
        """The four-column extension frame serializes to the expected payload."""
        # Work on a copy so naming the index does not touch the fixture value.
        df = df.copy()
        df.index.name = "idx"
        result = self._table_json_without_version(df)

        fields = [
            OrderedDict({"name": "idx", "type": "integer"}),
            OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}),
            OrderedDict({"name": "B", "type": "number", "extDtype": "decimal"}),
            OrderedDict({"name": "C", "type": "any", "extDtype": "string"}),
            OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}),
        ]
        schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]})
        row = OrderedDict(
            [
                ("idx", 0),
                ("A", "2021-10-10T00:00:00.000"),
                ("B", 10.0),
                ("C", "pandas"),
                ("D", 10),
            ]
        )
        expected = OrderedDict([("schema", schema), ("data", [row])])
        assert result == expected

    def test_json_ext_dtype_reading_roundtrip(self):
        """Nullable Int64/Float64/boolean columns and index survive a round trip."""
        # GH#40255
        df = DataFrame(
            {
                "a": Series([2, NA], dtype="Int64"),
                "b": Series([1.5, NA], dtype="Float64"),
                "c": Series([True, NA], dtype="boolean"),
            },
            index=Index([1, NA], dtype="Int64"),
        )
        expected = df.copy()
        payload = df.to_json(orient="table", indent=4)
        result = read_json(StringIO(payload), orient="table")
        tm.assert_frame_equal(result, expected)

    def test_json_ext_dtype_reading(self):
        """A hand-written payload with ``extDtype`` "Int64" reads as Int64 with NA."""
        # GH#40255
        data_json = """{
"schema":{
"fields":[
{
"name":"a",
"type":"integer",
"extDtype":"Int64"
}
],
},
"data":[
{
"a":2
},
{
"a":null
}
]
}"""
        result = read_json(StringIO(data_json), orient="table")
        expected = DataFrame({"a": Series([2, NA], dtype="Int64")})
        tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,907 @@
import json
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
json_normalize,
)
import pandas._testing as tm
from pandas.io.json._normalize import nested_to_record
@pytest.fixture
def deep_nested():
    """Country records, each holding states that in turn hold cities."""
    # deeply nested data
    usa = {
        "country": "USA",
        "states": [
            {
                "name": "California",
                "cities": [
                    {"name": "San Francisco", "pop": 12345},
                    {"name": "Los Angeles", "pop": 12346},
                ],
            },
            {
                "name": "Ohio",
                "cities": [
                    {"name": "Columbus", "pop": 1234},
                    {"name": "Cleveland", "pop": 1236},
                ],
            },
        ],
    }
    germany = {
        "country": "Germany",
        "states": [
            {"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
            {
                "name": "Nordrhein-Westfalen",
                "cities": [
                    {"name": "Duesseldorf", "pop": 1238},
                    {"name": "Koeln", "pop": 1239},
                ],
            },
        ],
    }
    return [usa, germany]
@pytest.fixture
def state_data():
    """Two state records with a county list, a nested ``info`` dict and names."""
    florida = {
        "counties": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
            {"name": "Palm Beach", "population": 60000},
        ],
        "info": {"governor": "Rick Scott"},
        "shortname": "FL",
        "state": "Florida",
    }
    ohio = {
        "counties": [
            {"name": "Summit", "population": 1234},
            {"name": "Cuyahoga", "population": 1337},
        ],
        "info": {"governor": "John Kasich"},
        "shortname": "OH",
        "state": "Ohio",
    }
    return [florida, ohio]
@pytest.fixture
def author_missing_data():
    """Two records: one with ``info`` set to None, one fully populated."""
    without_info = {"info": None}
    with_info = {
        "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
        "author_name": {"first": "Jane", "last_name": "Doe"},
    }
    return [without_info, with_info]
@pytest.fixture
def missing_metadata():
    """Two address records; only the first one has a ``name`` key."""
    named = {
        "name": "Alice",
        "addresses": [
            {
                "number": 9562,
                "street": "Morris St.",
                "city": "Massillon",
                "state": "OH",
                "zip": 44646,
            }
        ],
        "previous_residences": {"cities": [{"city_name": "Foo York City"}]},
    }
    unnamed = {
        "addresses": [
            {
                "number": 8449,
                "street": "Spring St.",
                "city": "Elizabethton",
                "state": "TN",
                "zip": 37643,
            }
        ],
        "previous_residences": {"cities": [{"city_name": "Barmingham"}]},
    }
    return [named, unnamed]
@pytest.fixture
def max_level_test_input_data():
    """
    input data to test json_normalize with max_level param
    """
    record = {
        "CreatedBy": {"Name": "User001"},
        "Lookup": {
            "TextField": "Some text",
            "UserField": {"Id": "ID001", "Name": "Name001"},
        },
        "Image": {"a": "b"},
    }
    return [record]
class TestJSONNormalize:
    """Behavioural tests for ``json_normalize``."""

    def test_simple_records(self):
        """Flat records normalize to the same frame as ``DataFrame(records)``."""
        recs = [
            {"a": 1, "b": 2, "c": 3},
            {"a": 4, "b": 5, "c": 6},
            {"a": 7, "b": 8, "c": 9},
            {"a": 10, "b": 11, "c": 12},
        ]
        result = json_normalize(recs)
        expected = DataFrame(recs)
        tm.assert_frame_equal(result, expected)

    def test_simple_normalize(self, state_data):
        """``record_path`` works for one dict, a list of dicts, and with ``meta``."""
        first_state = state_data[0]
        result = json_normalize(first_state, "counties")
        expected = DataFrame(first_state["counties"])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, "counties")
        all_counties = [county for rec in state_data for county in rec["counties"]]
        expected = DataFrame(all_counties)
        tm.assert_frame_equal(result, expected)

        result = json_normalize(state_data, "counties", meta="state")
        # Three counties belong to the first state and two to the second.
        expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
        tm.assert_frame_equal(result, expected)

    def test_fields_list_type_normalize(self):
        """A list-valued meta field is repeated as-is for every record."""
        parse_metadata_fields_list_type = [
            {"values": [1, 2, 3], "metadata": {"listdata": [1, 2]}}
        ]
        result = json_normalize(
            parse_metadata_fields_list_type,
            record_path=["values"],
            meta=[["metadata", "listdata"]],
        )
        expected = DataFrame(
            {0: [1, 2, 3], "metadata.listdata": [[1, 2], [1, 2], [1, 2]]}
        )
        tm.assert_frame_equal(result, expected)

    def test_empty_array(self):
        """An empty list normalizes to an empty frame."""
        result = json_normalize([])
        expected = DataFrame()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "data, record_path, exception_type",
        [
            ([{"a": 0}, {"a": 1}], None, None),
            ({"a": [{"a": 0}, {"a": 1}]}, "a", None),
            ('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
            (None, None, NotImplementedError),
        ],
    )
    def test_accepted_input(self, data, record_path, exception_type):
        """List and dict inputs are accepted; str and None raise."""
        if exception_type is not None:
            with pytest.raises(exception_type, match=tm.EMPTY_STRING_PATTERN):
                json_normalize(data, record_path=record_path)
            return

        result = json_normalize(data, record_path=record_path)
        expected = DataFrame([0, 1], columns=["a"])
        tm.assert_frame_equal(result, expected)

    def test_simple_normalize_with_separator(self, deep_nested):
        """``sep`` controls how nested keys are joined into column names."""
        # GH 14883
        # An empty kwargs dict exercises the default separator.
        separator_cases = [
            ({}, ["A.A", "A.B"]),
            ({"sep": "_"}, ["A_A", "A_B"]),
            ({"sep": "\u03c3"}, ["A\u03c3A", "A\u03c3B"]),
        ]
        for sep_kwargs, columns in separator_cases:
            result = json_normalize({"A": {"A": 1, "B": 2}}, **sep_kwargs)
            expected = DataFrame([[1, 2]], columns=columns)
            tm.assert_frame_equal(result.reindex_like(expected), expected)

        result = json_normalize(
            deep_nested,
            ["states", "cities"],
            meta=["country", ["states", "name"]],
            sep="_",
        )
        expected = Index(["name", "pop", "country", "states_name"]).sort_values()
        assert result.columns.sort_values().equals(expected)

    def test_normalize_with_multichar_separator(self):
        """A multi-character ``sep`` is used verbatim in column names."""
        # GH #43831
        data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}}
        result = json_normalize(data, sep="__")
        expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"])
        tm.assert_frame_equal(result, expected)

    def test_value_array_record_prefix(self):
        """``record_prefix`` is applied to the column of a plain value array."""
        # GH 21536
        result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
        expected = DataFrame([[1], [2]], columns=["Prefix.0"])
        tm.assert_frame_equal(result, expected)

    def test_nested_object_record_path(self):
        """``record_path`` may descend through a nested dict to reach the list."""
        # GH 22706
        data = {
            "state": "Florida",
            "info": {
                "governor": "Rick Scott",
                "counties": [
                    {"name": "Dade", "population": 12345},
                    {"name": "Broward", "population": 40000},
                    {"name": "Palm Beach", "population": 60000},
                ],
            },
        }
        result = json_normalize(data, record_path=["info", "counties"])
        expected = DataFrame(
            [["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
            columns=["name", "population"],
        )
        tm.assert_frame_equal(result, expected)

    def test_more_deeply_nested(self, deep_nested):
        """Two-level ``record_path`` with meta taken from both outer levels."""
        result = json_normalize(
            deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
        )
        ex_data = {
            "country": ["USA"] * 4 + ["Germany"] * 3,
            "states.name": [
                "California",
                "California",
                "Ohio",
                "Ohio",
                "Bayern",
                "Nordrhein-Westfalen",
                "Nordrhein-Westfalen",
            ],
            "name": [
                "San Francisco",
                "Los Angeles",
                "Columbus",
                "Cleveland",
                "Munich",
                "Duesseldorf",
                "Koeln",
            ],
            "pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
        }
        # Column order is taken from the result; only the contents are compared.
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_shallow_nested(self):
        """One-level ``record_path`` with flat and nested meta fields."""
        data = [
            {
                "state": "Florida",
                "shortname": "FL",
                "info": {"governor": "Rick Scott"},
                "counties": [
                    {"name": "Dade", "population": 12345},
                    {"name": "Broward", "population": 40000},
                    {"name": "Palm Beach", "population": 60000},
                ],
            },
            {
                "state": "Ohio",
                "shortname": "OH",
                "info": {"governor": "John Kasich"},
                "counties": [
                    {"name": "Summit", "population": 1234},
                    {"name": "Cuyahoga", "population": 1337},
                ],
            },
        ]
        result = json_normalize(
            data, "counties", ["state", "shortname", ["info", "governor"]]
        )
        ex_data = {
            "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
            "state": ["Florida"] * 3 + ["Ohio"] * 2,
            "shortname": ["FL", "FL", "FL", "OH", "OH"],
            "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
            "population": [12345, 40000, 60000, 1234, 1337],
        }
        # Column order is taken from the result; only the contents are compared.
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)

    def test_nested_meta_path_with_nested_record_path(self, state_data):
        """Nested meta paths work together with ``errors="ignore"``."""
        # GH 27220
        result = json_normalize(
            data=state_data,
            record_path=["counties"],
            meta=["state", "shortname", ["info", "governor"]],
            errors="ignore",
        )
        ex_data = {
            "name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
            "population": [12345, 40000, 60000, 1234, 1337],
            "state": ["Florida"] * 3 + ["Ohio"] * 2,
            "shortname": ["FL"] * 3 + ["OH"] * 2,
            "info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
        }
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)

    def test_meta_name_conflict(self):
        """Meta names clashing with record keys raise unless a prefix is given."""
        data = [
            {
                "foo": "hello",
                "bar": "there",
                "data": [
                    {"foo": "something", "bar": "else"},
                    {"foo": "something2", "bar": "else2"},
                ],
            }
        ]
        # Deliberately a regex: either conflicting name may be reported.
        msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
        with pytest.raises(ValueError, match=msg):
            json_normalize(data, "data", meta=["foo", "bar"])

        result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")
        for column in ["metafoo", "metabar", "foo", "bar"]:
            assert column in result

    def test_meta_parameter_not_modified(self):
        """The list passed as ``meta`` is left untouched by the call."""
        # GH 18610
        data = [
            {
                "foo": "hello",
                "bar": "there",
                "data": [
                    {"foo": "something", "bar": "else"},
                    {"foo": "something2", "bar": "else2"},
                ],
            }
        ]
        COLUMNS = ["foo", "bar"]
        result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")
        assert COLUMNS == ["foo", "bar"]
        for column in ["metafoo", "metabar", "foo", "bar"]:
            assert column in result

    def test_record_prefix(self, state_data):
        """``record_prefix`` renames record columns but not meta columns."""
        first_state = state_data[0]
        result = json_normalize(first_state, "counties")
        expected = DataFrame(first_state["counties"])
        tm.assert_frame_equal(result, expected)

        result = json_normalize(
            state_data, "counties", meta="state", record_prefix="county_"
        )
        all_counties = [county for rec in state_data for county in rec["counties"]]
        expected = DataFrame(all_counties)
        expected = expected.rename(columns=lambda x: "county_" + x)
        expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
        tm.assert_frame_equal(result, expected)

    def test_non_ascii_key(self):
        """Non-ASCII keys are kept as column names."""
        testjson = (
            b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
            b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
        ).decode("utf8")
        testdata = {
            b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
            "sub.A": [1, 3],
            "sub.B": [2, 4],
        }
        expected = DataFrame(testdata)
        result = json_normalize(json.loads(testjson))
        tm.assert_frame_equal(result, expected)

    def test_missing_field(self, author_missing_data):
        """Records lacking nested fields yield NaN in the flattened columns."""
        # GH20030:
        result = json_normalize(author_missing_data)
        ex_data = [
            {
                "info": np.nan,
                "info.created_at": np.nan,
                "info.last_updated": np.nan,
                "author_name.first": np.nan,
                "author_name.last_name": np.nan,
            },
            {
                "info": None,
                "info.created_at": "11/08/1993",
                "info.last_updated": "26/05/2012",
                "author_name.first": "Jane",
                "author_name.last_name": "Doe",
            },
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "max_level,expected",
        [
            (
                0,
                [
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                ],
            ),
            (
                1,
                [
                    {
                        "TextField": "Some text",
                        "UserField.Id": "ID001",
                        "UserField.Name": "Name001",
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField.Id": "ID001",
                        "UserField.Name": "Name001",
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                ],
            ),
        ],
    )
    def test_max_level_with_records_path(self, max_level, expected):
        """``max_level`` limits flattening of records pulled via ``record_path``."""
        # GH23843: Enhanced JSON normalize
        test_input = [
            {
                "CreatedBy": {"Name": "User001"},
                "Lookup": [
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                    },
                ],
                "Image": {"a": "b"},
                "tags": [
                    {"foo": "something", "bar": "else"},
                    {"foo": "something2", "bar": "else2"},
                ],
            }
        ]
        result = json_normalize(
            test_input,
            record_path=["Lookup"],
            meta=[["CreatedBy"], ["Image"]],
            max_level=max_level,
        )
        expected_df = DataFrame(data=expected, columns=result.columns.values)
        tm.assert_equal(expected_df, result)

    def test_nested_flattening_consistent(self):
        """Flattening is the same with and without a ``record_path``."""
        # see gh-21537
        df1 = json_normalize([{"A": {"B": 1}}])
        df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy")
        # They should be the same.
        tm.assert_frame_equal(df1, df2)

    def test_nonetype_record_path(self, nulls_fixture):
        """A null value at ``record_path`` contributes no rows."""
        # see gh-30148
        # should not raise TypeError
        result = json_normalize(
            [
                {"state": "Texas", "info": nulls_fixture},
                {"state": "Florida", "info": [{"i": 2}]},
            ],
            record_path=["info"],
        )
        expected = DataFrame({"i": 2}, index=[0])
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"'])
    def test_non_list_record_path_errors(self, value):
        """A non-list, non-null value at ``record_path`` raises ``TypeError``."""
        # Function-scope import keeps this block self-contained.
        import re

        # see gh-30148, GH 26284
        parsed_value = json.loads(value)
        test_input = {"state": "Texas", "info": parsed_value}
        test_path = "info"
        msg = (
            f"{test_input} has non list value {parsed_value} for path {test_path}. "
            "Must be list or null."
        )
        # The message embeds dict reprs ("{", "}", ".") and ``match`` is a
        # regex, so escape it to compare the text literally.
        with pytest.raises(TypeError, match=re.escape(msg)):
            json_normalize([test_input], record_path=[test_path])

    def test_meta_non_iterable(self):
        """An integer meta value is carried over as an object column."""
        # GH 31507
        data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]"""
        result = json_normalize(json.loads(data), record_path=["data"], meta=["id"])
        expected = DataFrame(
            {"one": [1], "two": [2], "id": np.array([99], dtype=object)}
        )
        tm.assert_frame_equal(result, expected)

    def test_generator(self, state_data):
        """Generator input is consumed from its first element."""
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        def generator_data():
            yield from state_data[0]["counties"]

        result = json_normalize(generator_data())
        expected = DataFrame(state_data[0]["counties"])
        tm.assert_frame_equal(result, expected)

    def test_top_column_with_leading_underscore(self):
        """A top-level key starting with the separator keeps its leading character."""
        # 49861
        data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
        result = json_normalize(data, sep="_")
        expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])
        tm.assert_frame_equal(result, expected)
class TestNestedToRecord:
    """Tests for ``nested_to_record`` and related ``json_normalize`` cases."""

    def test_flat_stays_flat(self):
        """Records without nesting are returned unchanged."""
        flat_records = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
        flattened = nested_to_record(flat_records)
        assert flattened == flat_records

    def test_one_level_deep_flattens(self):
        """A single nested dict is flattened using dotted keys."""
        data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}
        flattened = nested_to_record(data)
        assert flattened == {"dict1.c": 1, "dict1.d": 2, "flat1": 1}

    def test_nested_flattens(self):
        """Dicts nested two levels deep are flattened completely."""
        data = {
            "flat1": 1,
            "dict1": {"c": 1, "d": 2},
            "nested": {"e": {"c": 1, "d": 2}, "d": 2},
        }
        flattened = nested_to_record(data)
        expected = {
            "dict1.c": 1,
            "dict1.d": 2,
            "flat1": 1,
            "nested.d": 2,
            "nested.e.c": 1,
            "nested.e.d": 2,
        }
        assert flattened == expected

    def test_json_normalize_errors(self, missing_metadata):
        """A missing meta key raises ``KeyError`` under ``errors="raise"``."""
        # GH14583:
        # If meta keys are not always present a new option to set
        # errors='ignore' has been implemented
        msg = (
            "Key 'name' not found. To replace missing values of "
            "'name' with np.nan, pass in errors='ignore'"
        )
        normalize_kwargs = {
            "data": missing_metadata,
            "record_path": "addresses",
            "meta": "name",
            "errors": "raise",
        }
        with pytest.raises(KeyError, match=msg):
            json_normalize(**normalize_kwargs)

    def test_missing_meta(self, missing_metadata):
        """A missing meta key becomes NaN under ``errors="ignore"``."""
        # GH25468
        # If metadata is nullable with errors set to ignore, the null values
        # should be numpy.nan values
        result = json_normalize(
            data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
        )
        columns = ["number", "street", "city", "state", "zip", "name"]
        rows = [
            [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
            [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
        ]
        expected = DataFrame(rows, columns=columns)
        tm.assert_frame_equal(result, expected)

    def test_missing_nested_meta(self):
        """A null parent of a nested meta path gives NaN or raises, per ``errors``."""
        # GH44312
        # If errors="ignore" and nested metadata is null, we should return nan
        data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
        meta_paths = ["meta", ["nested_meta", "leaf"]]
        result = json_normalize(
            data,
            record_path="value",
            meta=meta_paths,
            errors="ignore",
        )
        rows = [[1, "foo", np.nan], [2, "foo", np.nan]]
        columns = ["rec", "meta", "nested_meta.leaf"]
        expected = DataFrame(rows, columns=columns).astype(
            {"nested_meta.leaf": object}
        )
        tm.assert_frame_equal(result, expected)

        # If errors="raise" and nested metadata is null, we should raise with the
        # key of the first missing level
        with pytest.raises(KeyError, match="'leaf' not found"):
            json_normalize(
                data,
                record_path="value",
                meta=["meta", ["nested_meta", "leaf"]],
                errors="raise",
            )

    def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
        """``errors="raise"`` also applies with a two-element ``record_path``."""
        # GH41876
        # Ensure errors='raise' works as intended even when a record_path of length
        # greater than one is passed in
        msg = (
            "Key 'name' not found. To replace missing values of "
            "'name' with np.nan, pass in errors='ignore'"
        )
        normalize_kwargs = {
            "data": missing_metadata,
            "record_path": ["previous_residences", "cities"],
            "meta": "name",
            "errors": "raise",
        }
        with pytest.raises(KeyError, match=msg):
            json_normalize(**normalize_kwargs)

    def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
        """``errors="ignore"`` also applies with a two-element ``record_path``."""
        # GH41876
        # Ensure errors='ignore' works as intended even when a record_path of length
        # greater than one is passed in
        result = json_normalize(
            data=missing_metadata,
            record_path=["previous_residences", "cities"],
            meta="name",
            errors="ignore",
        )
        rows = [
            ["Foo York City", "Alice"],
            ["Barmingham", np.nan],
        ]
        expected = DataFrame(rows, columns=["city_name", "name"])
        tm.assert_frame_equal(result, expected)

    def test_donot_drop_nonevalues(self):
        """A key whose value is None is kept in the flattened record."""
        # GH21356
        data = [
            {"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
            {
                "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
                "author_name": {"first": "Jane", "last_name": "Doe"},
            },
        ]
        flattened = nested_to_record(data)
        expected = [
            {
                "info": None,
                "author_name.first": "Smith",
                "author_name.last_name": "Appleseed",
            },
            {
                "author_name.first": "Jane",
                "author_name.last_name": "Doe",
                "info.created_at": "11/08/1993",
                "info.last_updated": "26/05/2012",
            },
        ]
        assert flattened == expected

    def test_nonetype_top_level_bottom_level(self):
        """None values at the top and at the deepest level are both kept."""
        # GH21158: If inner level json has a key with a null value
        # make sure it does not do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "country": {
                    "state": {
                        "id": None,
                        "town.info": {
                            "id": None,
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656,
                        },
                    }
                }
            },
        }
        flattened = nested_to_record(data)
        expected = {
            "id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert flattened == expected

    def test_nonetype_multiple_levels(self):
        """None values at every nesting level are kept."""
        # GH21158: If inner level json has a key with a null value
        # make sure it does not do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "id": None,
                "country": {
                    "id": None,
                    "state": {
                        "id": None,
                        "town.info": {
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656,
                        },
                    },
                },
            },
        }
        flattened = nested_to_record(data)
        expected = {
            "id": None,
            "location.id": None,
            "location.country.id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert flattened == expected

    @pytest.mark.parametrize(
        "max_level, expected",
        [
            (
                None,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField.Id": "ID001",
                        "Lookup.UserField.Name": "Name001",
                        "Image.a": "b",
                    }
                ],
            ),
            (
                0,
                [
                    {
                        "CreatedBy": {"Name": "User001"},
                        "Lookup": {
                            "TextField": "Some text",
                            "UserField": {"Id": "ID001", "Name": "Name001"},
                        },
                        "Image": {"a": "b"},
                    }
                ],
            ),
            (
                1,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
                        "Image.a": "b",
                    }
                ],
            ),
        ],
    )
    def test_with_max_level(self, max_level, expected, max_level_test_input_data):
        """``max_level`` bounds how many levels are flattened."""
        # GH23843: Enhanced JSON normalize
        flattened = nested_to_record(max_level_test_input_data, max_level=max_level)
        assert flattened == expected

    def test_with_large_max_level(self):
        """A ``max_level`` larger than the actual depth flattens everything."""
        # GH23843: Enhanced JSON normalize
        max_level = 100
        input_data = [
            {
                "CreatedBy": {
                    "user": {
                        "name": {"firstname": "Leo", "LastName": "Thomson"},
                        "family_tree": {
                            "father": {
                                "name": "Father001",
                                "father": {
                                    "Name": "Father002",
                                    "father": {
                                        "name": "Father003",
                                        "father": {"Name": "Father004"},
                                    },
                                },
                            }
                        },
                    }
                }
            }
        ]
        expected = [
            {
                "CreatedBy.user.name.firstname": "Leo",
                "CreatedBy.user.name.LastName": "Thomson",
                "CreatedBy.user.family_tree.father.name": "Father001",
                "CreatedBy.user.family_tree.father.father.Name": "Father002",
                "CreatedBy.user.family_tree.father.father.father.name": "Father003",
                "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004",  # noqa: E501
            }
        ]
        flattened = nested_to_record(input_data, max_level=max_level)
        assert flattened == expected

    def test_series_non_zero_index(self):
        """A Series of dicts labelled 1..3 normalizes to the expected frame."""
        # GH 19020
        data = {
            0: {"id": 1, "name": "Foo", "elements": {"a": 1}},
            1: {"id": 2, "name": "Bar", "elements": {"b": 2}},
            2: {"id": 3, "name": "Baz", "elements": {"c": 3}},
        }
        ser = Series(data)
        ser.index = [1, 2, 3]
        result = json_normalize(ser)
        expected = DataFrame(
            {
                "id": [1, 2, 3],
                "name": ["Foo", "Bar", "Baz"],
                "elements.a": [1.0, np.nan, np.nan],
                "elements.b": [np.nan, 2.0, np.nan],
                "elements.c": [np.nan, np.nan, 3.0],
            }
        )
        tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,532 @@
from collections.abc import Iterator
from io import StringIO
from pathlib import Path
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
read_json,
)
import pandas._testing as tm
from pandas.io.json._json import JsonReader
@pytest.fixture
def lines_json_df():
    """Return line-delimited, records-oriented JSON text for a 3x2 frame."""
    frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    serialized = frame.to_json(lines=True, orient="records")
    return serialized
def test_read_jsonl():
    """Read two JSON lines whose keys appear in a different order (GH9180)."""
    buf = StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
    expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    tm.assert_frame_equal(read_json(buf, lines=True), expected)
def test_read_jsonl_engine_pyarrow(datapath, engine):
    """Read a line-delimited JSON file from disk using the ``engine`` fixture."""
    source = datapath("io", "json", "data", "line_delimited.json")
    expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
    tm.assert_frame_equal(read_json(source, lines=True, engine=engine), expected)
def test_read_datetime(request, engine):
    """Read back a one-line JSON record whose values are lists (GH33787).

    The source frame has a single row where ``accounts`` and ``date`` each
    hold a two-element list; the frame read back is expected to have two rows,
    with the date strings left as plain strings.

    Parameters
    ----------
    request : pytest fixture
        Used to attach an ``xfail`` marker for the pyarrow engine.
    engine : str
        JSON engine passed to ``read_json``.
    """
    if engine == "pyarrow":
        # GH 48893
        reason = "Pyarrow only supports a file path as an input and line delimited json"
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    df = DataFrame(
        [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
        columns=["accounts", "date", "name"],
    )
    json_line = df.to_json(lines=True, orient="records")

    # The call is the same for every engine; the pyarrow case is covered by
    # the xfail marker added above.
    result = read_json(StringIO(json_line), engine=engine)
    expected = DataFrame(
        [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
        columns=["accounts", "date", "name"],
    )
    tm.assert_frame_equal(result, expected)
def test_read_jsonl_unicode_chars():
    """Non-ASCII characters survive a line-delimited read (GH15132).

    \\u201d == RIGHT DOUBLE QUOTATION MARK
    """
    payload = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])

    # simulate file handle
    handle = StringIO(payload)
    tm.assert_frame_equal(read_json(handle, lines=True), expected)

    # simulate string
    tm.assert_frame_equal(read_json(StringIO(payload), lines=True), expected)
def test_to_jsonl():
    """``to_json(lines=True)`` output, including quoting and escaping (GH9180)."""
    plain = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    assert plain.to_json(orient="records", lines=True) == (
        '{"a":1,"b":2}\n{"a":1,"b":2}\n'
    )

    # Values containing a brace and a double quote; must also round-trip.
    quoted = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
    quoted_text = quoted.to_json(orient="records", lines=True)
    assert quoted_text == '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
    tm.assert_frame_equal(read_json(StringIO(quoted_text), lines=True), quoted)

    # GH15096: escaped characters in columns and data
    escaped = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
    escaped_text = escaped.to_json(orient="records", lines=True)
    assert escaped_text == (
        '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
    )
    tm.assert_frame_equal(read_json(StringIO(escaped_text), lines=True), escaped)
def test_to_jsonl_count_new_lines():
    """A two-row frame serializes to text containing two newlines (GH36888)."""
    frame = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    serialized = frame.to_json(orient="records", lines=True)
    assert serialized.count("\n") == 2
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(request, lines_json_df, chunksize, engine):
    """Chunked and unchunked ``read_json`` give the same frame.

    GH17048: memory usage when lines=True
    """
    if engine == "pyarrow":
        # GH 48893
        why = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        marker = pytest.mark.xfail(reason=why, raises=ValueError)
        request.node.add_marker(marker)

    whole = read_json(StringIO(lines_json_df), lines=True)

    chunk_source = StringIO(lines_json_df)
    with read_json(
        chunk_source, lines=True, chunksize=chunksize, engine=engine
    ) as chunk_reader:
        pieces = pd.concat(chunk_reader)

    tm.assert_frame_equal(pieces, whole)
def test_readjson_chunksize_requires_lines(lines_json_df, engine):
    """Passing ``chunksize`` with ``lines=False`` raises ``ValueError``."""
    buf = StringIO(lines_json_df)
    with pytest.raises(
        ValueError, match="chunksize can only be passed if lines=True"
    ):
        with read_json(buf, lines=False, chunksize=2, engine=engine) as _:
            pass
def test_readjson_chunks_series(request, engine):
    """Reading line-format JSON to a Series matches with and without chunks."""
    if engine == "pyarrow":
        # GH 48893
        why = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=why))

    ser = pd.Series({"A": 1, "B": 2})

    def fresh_buffer():
        # Each read gets its own buffer positioned at the start.
        return StringIO(ser.to_json(lines=True, orient="records"))

    unchunked = read_json(fresh_buffer(), lines=True, typ="Series", engine=engine)

    with read_json(
        fresh_buffer(), lines=True, typ="Series", chunksize=1, engine=engine
    ) as chunk_reader:
        chunked = pd.concat(chunk_reader)

    tm.assert_series_equal(chunked, unchunked)
def test_readjson_each_chunk(request, lines_json_df, engine):
    """Check the shape of each intermediate chunk for ``chunksize=2``.

    Other tests check that the final result of read_json(chunksize=True)
    is correct. This checks the intermediate chunks.
    """
    if engine == "pyarrow":
        # GH 48893
        why = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        marker = pytest.mark.xfail(reason=why, raises=ValueError)
        request.node.add_marker(marker)

    with read_json(
        StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
    ) as chunk_reader:
        first, second = list(chunk_reader)[:2]

    assert first.shape == (2, 2)
    assert second.shape == (1, 2)
def test_readjson_chunks_from_file(request, engine):
    """Chunked and unchunked reads of the same on-disk file agree."""
    if engine == "pyarrow":
        # GH 48893
        why = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        marker = pytest.mark.xfail(reason=why, raises=ValueError)
        request.node.add_marker(marker)

    frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    with tm.ensure_clean("test.json") as path:
        frame.to_json(path, lines=True, orient="records")

        with read_json(path, lines=True, chunksize=1, engine=engine) as chunk_reader:
            chunked = pd.concat(chunk_reader)

        unchunked = read_json(path, lines=True, engine=engine)
        tm.assert_frame_equal(unchunked, chunked)
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    """The reader's handle is closed after leaving the ``JsonReader`` context."""
    reader_options = {
        "orient": None,
        "typ": "frame",
        "dtype": True,
        "convert_axes": True,
        "convert_dates": True,
        "keep_default_dates": True,
        "precise_float": False,
        "date_unit": None,
        "encoding": None,
        "lines": True,
        "chunksize": chunksize,
        "compression": None,
        "nrows": None,
    }
    frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    with tm.ensure_clean("test.json") as path:
        frame.to_json(path, lines=True, orient="records")

        reader = JsonReader(path, **reader_options)
        with reader:
            reader.read()

        handle_closed = reader.handles.handle.closed
        assert handle_closed, f"didn't close stream with chunksize = {chunksize}"
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
    """Zero, negative, fractional or string ``chunksize`` raises ``ValueError``."""
    expected_error = r"'chunksize' must be an integer >=1"
    buf = StringIO(lines_json_df)
    with pytest.raises(ValueError, match=expected_error):
        with read_json(buf, lines=True, chunksize=chunksize, engine=engine) as _:
            pass
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    """Runs of empty lines between records are ignored, chunked or not.

    Parameters
    ----------
    chunksize : int or None
        ``None`` reads the whole payload at once; otherwise the reader is
        consumed chunk by chunk and the pieces are concatenated.
    """
    # The payload deliberately contains leading, trailing and multiple
    # consecutive empty lines between the three records, so that the case
    # named by this test is actually exercised.
    j = """

{"A":1,"B":4}



{"A":2,"B":5}







{"A":3,"B":6}
"""
    orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    result = read_json(StringIO(j), lines=True, chunksize=chunksize)
    if chunksize is not None:
        # With a chunksize, ``read_json`` returns a reader; close it via ``with``.
        with result as reader:
            result = pd.concat(reader)

    tm.assert_frame_equal(orig, result, obj=f"chunksize: {chunksize}")
def test_readjson_unicode(request, monkeypatch, engine):
    """Read a UTF-8 file while ``locale.getpreferredencoding`` reports cp949."""
    if engine == "pyarrow":
        # GH 48893
        why = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        marker = pytest.mark.xfail(reason=why, raises=ValueError)
        request.node.add_marker(marker)

    with tm.ensure_clean("test.json") as path:
        monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
        with open(path, "w", encoding="utf-8") as handle:
            handle.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')

        expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
        tm.assert_frame_equal(read_json(path, engine=engine), expected)
@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows, engine):
    """``nrows`` limits how many JSON lines are read (GH 33916)."""
    # NOTE(review): the ``engine`` fixture is requested but not forwarded to
    # ``read_json`` here - confirm whether that is intentional.
    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    all_rows = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]})
    limited = read_json(StringIO(jsonl), lines=True, nrows=nrows)
    tm.assert_frame_equal(limited, all_rows.iloc[:nrows])
@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
    """``nrows`` combined with ``chunksize`` yields the first rows (GH 33916)."""
    if engine == "pyarrow":
        # GH 48893
        why = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        marker = pytest.mark.xfail(reason=why, raises=ValueError)
        request.node.add_marker(marker)

    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""

    # The pyarrow case is handed the raw text; every other engine a buffer.
    source = jsonl if engine == "pyarrow" else StringIO(jsonl)
    with read_json(
        source, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
    ) as chunk_reader:
        chunked = pd.concat(chunk_reader)

    all_rows = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]})
    tm.assert_frame_equal(chunked, all_rows.iloc[:nrows])
def test_readjson_nrows_requires_lines(engine):
    """``nrows`` without ``lines=True`` raises ``ValueError`` (GH 33916)."""
    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    with pytest.raises(ValueError, match="nrows can only be passed if lines=True"):
        read_json(jsonl, lines=False, nrows=2, engine=engine)
def test_readjson_lines_chunks_fileurl(request, datapath, engine):
    """Read line-format JSON chunk by chunk from a ``file://`` URL (GH 27135)."""
    if engine == "pyarrow":
        # GH 48893
        why = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        marker = pytest.mark.xfail(reason=why, raises=ValueError)
        request.node.add_marker(marker)

    expected_chunks = [
        DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
        DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
        DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
    ]
    local_path = datapath("io", "json", "data", "line_delimited.json")
    url = Path(local_path).as_uri()

    with read_json(url, lines=True, chunksize=1, engine=engine) as url_reader:
        for position, chunk in enumerate(url_reader):
            tm.assert_frame_equal(chunk, expected_chunks[position])
def test_chunksize_is_incremental():
    """Check that a chunked ``read_json`` touches its source many times.

    A small wrapper around ``StringIO`` counts every ``read`` and ``__iter__``
    call; the test then asserts that more than one chunk is produced and that
    the counter exceeds 10.
    """
    # See https://github.com/pandas-dev/pandas/issues/34548
    # Four JSON lines repeated 1000 times.
    jsonl = (
        """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}\n"""
        * 1000
    )

    class MyReader:
        # File-like wrapper that records how often it is read or iterated.
        def __init__(self, contents) -> None:
            self.read_count = 0
            self.stringio = StringIO(contents)

        def read(self, *args):
            self.read_count += 1
            return self.stringio.read(*args)

        def __iter__(self) -> Iterator:
            self.read_count += 1
            return iter(self.stringio)

    reader = MyReader(jsonl)
    assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
    # NOTE(review): presumably a high count shows the data is pulled in
    # pieces rather than in a single read - confirm against the linked issue.
    assert reader.read_count > 10
@pytest.mark.parametrize("orient_", ["split", "index", "table"])
def test_to_json_append_orient(orient_):
    """``mode="a"`` with an orient other than 'records' raises (GH 35849)."""
    frame = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    expected_error = (
        r"mode='a' \(append\) is only supported when"
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=expected_error):
        frame.to_json(mode="a", orient=orient_)
def test_to_json_append_lines():
    """``mode="a"`` with ``lines=False`` raises ``ValueError`` (GH 35849)."""
    frame = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    expected_error = (
        r"mode='a' \(append\) is only supported when"
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=expected_error):
        frame.to_json(mode="a", lines=False, orient="records")
@pytest.mark.parametrize("mode_", ["r", "x"])
def test_to_json_append_mode(mode_):
    """A ``mode`` of 'r' or 'x' raises ``ValueError`` (GH 35849)."""
    frame = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    expected_error = (
        f"mode={mode_} is not a valid option."
        "Only 'w' and 'a' are currently supported."
    )
    with pytest.raises(ValueError, match=expected_error):
        frame.to_json(mode=mode_, lines=False, orient="records")
def test_to_json_append_output_consistent_columns():
    """Appending rows with the same columns reads back as one frame.

    GH 35849
    """
    first = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    second = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    combined = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})

    with tm.ensure_clean("test.json") as path:
        # Write the first frame normally, then append the second to it.
        first.to_json(path, lines=True, orient="records")
        second.to_json(path, mode="a", lines=True, orient="records")

        tm.assert_frame_equal(read_json(path, lines=True), combined)
def test_to_json_append_output_inconsistent_columns():
    """Appending a frame with one shared and one new column (GH 35849)."""
    first = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    third = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})

    combined = DataFrame(
        {
            "col1": [1, 2, None, None],
            "col2": ["a", "b", "e", "f"],
            "col3": [np.nan, np.nan, "!", "#"],
        }
    )

    with tm.ensure_clean("test.json") as path:
        # Both frames are written in append mode to the same file.
        for frame in (first, third):
            frame.to_json(path, mode="a", lines=True, orient="records")

        tm.assert_frame_equal(read_json(path, lines=True), combined)
def test_to_json_append_output_different_columns():
    """Appending frames with same, differing and new columns (GH 35849)."""
    frames = [
        DataFrame({"col1": [1, 2], "col2": ["a", "b"]}),
        DataFrame({"col1": [3, 4], "col2": ["c", "d"]}),
        DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]}),
        DataFrame({"col4": [True, False]}),
    ]

    combined = DataFrame(
        {
            "col1": [1, 2, 3, 4, None, None, None, None],
            "col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan],
            "col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan],
            "col4": [None, None, None, None, None, None, True, False],
        }
    ).astype({"col4": "float"})

    with tm.ensure_clean("test.json") as path:
        # Append every frame, in list order, to the same file.
        for frame in frames:
            frame.to_json(path, mode="a", lines=True, orient="records")

        tm.assert_frame_equal(read_json(path, lines=True), combined)
def test_to_json_append_output_different_columns_reordered():
    """Result column order follows the order in which frames were appended.

    GH 35849
    """
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    # df4, df3, df2, df1 (in that order)
    write_order = (df4, df3, df2, df1)
    combined = DataFrame(
        {
            "col4": [True, False, None, None, None, None, None, None],
            "col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"],
            "col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan],
            "col1": [None, None, None, None, 3, 4, 1, 2],
        }
    ).astype({"col4": "float"})

    with tm.ensure_clean("test.json") as path:
        for frame in write_order:
            frame.to_json(path, mode="a", lines=True, orient="records")

        tm.assert_frame_equal(read_json(path, lines=True), combined)

View File

@@ -0,0 +1,287 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
# Module-level pytest mark: requests the "pyarrow_skip" fixture for the tests
# in this module. The fixture is not defined in the visible part of this
# file - presumably it lives in a shared conftest; verify there.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
    """Chunked reads honour ``index_col`` given by position or by name.

    Six data rows are read with ``chunksize=2``; each of the first three
    chunks is compared with the matching slice of the expected frame.

    Parameters
    ----------
    all_parsers : fixture
        Parser object exposing ``read_csv``.
    index_col : int or str
        Index column, as position ``0`` or as the label ``"index"``.
    """
    parser = all_parsers
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

    expected = DataFrame(
        [
            ["foo", 2, 3, 4, 5],
            ["bar", 7, 8, 9, 10],
            ["baz", 12, 13, 14, 15],
            ["qux", 12, 13, 14, 15],
            ["foo2", 12, 13, 14, 15],
            ["bar2", 12, 13, 14, 15],
        ],
        columns=["index", "A", "B", "C", "D"],
    )
    expected = expected.set_index("index")

    # Forward the parametrized value so that both the positional and the
    # by-name form of ``index_col`` are actually exercised.
    with parser.read_csv(
        StringIO(data), index_col=index_col, chunksize=2
    ) as reader:
        chunks = list(reader)

    tm.assert_frame_equal(chunks[0], expected[:2])
    tm.assert_frame_equal(chunks[1], expected[2:4])
    tm.assert_frame_equal(chunks[2], expected[4:])
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
    """A fractional, string or zero ``chunksize`` raises ``ValueError``."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    expected_error = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=expected_error):
        with all_parsers.read_csv(StringIO(data), chunksize=chunksize) as _:
            pass
@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
    """``nrows`` is respected when reading in chunks (see gh-15755)."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    read_options = {"index_col": 0, "nrows": 5}
    unchunked = all_parsers.read_csv(StringIO(data), **read_options)

    with all_parsers.read_csv(
        StringIO(data), chunksize=chunksize, **read_options
    ) as chunk_reader:
        tm.assert_frame_equal(concat(chunk_reader), unchunked)
def test_read_chunksize_and_nrows_changing_size(all_parsers):
    """``get_chunk`` with varying sizes stops once ``nrows`` rows are consumed."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    read_options = {"index_col": 0, "nrows": 5}
    unchunked = all_parsers.read_csv(StringIO(data), **read_options)

    with all_parsers.read_csv(
        StringIO(data), chunksize=8, **read_options
    ) as chunk_reader:
        first_two = chunk_reader.get_chunk(size=2)
        tm.assert_frame_equal(first_two, unchunked.iloc[:2])

        # Four rows are requested, but slicing shows only three remain.
        remaining = chunk_reader.get_chunk(size=4)
        tm.assert_frame_equal(remaining, unchunked.iloc[2:5])

        with pytest.raises(StopIteration, match=""):
            chunk_reader.get_chunk(size=3)
def test_get_chunk_passed_chunksize(all_parsers):
    """``get_chunk()`` without a size returns ``chunksize`` rows."""
    data = """A,B,C
1,2,3
4,5,6
7,8,9
1,2,3"""
    with all_parsers.read_csv(StringIO(data), chunksize=2) as chunk_reader:
        first_chunk = chunk_reader.get_chunk()

    tm.assert_frame_equal(
        first_chunk, DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
    )
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
    """Concatenated chunks equal a single full read (see gh-12185)."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    full_read = all_parsers.read_csv(StringIO(data), **kwargs)
    with all_parsers.read_csv(StringIO(data), chunksize=2, **kwargs) as chunk_reader:
        tm.assert_frame_equal(concat(chunk_reader), full_read)
def test_read_chunksize_jagged_names(all_parsers):
    """Chunked read of seven short rows then one ten-field row (see gh-23509)."""
    short_rows = ["0"] * 7
    wide_row = ",".join(["0"] * 10)
    data = "\n".join(short_rows + [wide_row])

    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])

    with all_parsers.read_csv(
        StringIO(data), names=range(10), chunksize=4
    ) as chunk_reader:
        combined = concat(chunk_reader)
    tm.assert_frame_equal(combined, expected)
def test_chunk_begins_with_newline_whitespace(all_parsers):
    """Data starting with a newline then a leading space (see gh-10022)."""
    buf = StringIO("\n hello\nworld\n")
    expected = DataFrame([" hello", "world"])
    tm.assert_frame_equal(all_parsers.read_csv(buf, header=None), expected)
@pytest.mark.slow
def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
    """Mixed int/float input is read as one float column, with no warning."""
    # mainly an issue with the C parser
    heuristic = 2**3
    ints = [str(i) for i in range(heuristic - 1)]
    lines = ints + ["1.0", "2.0"] + ints
    data = "a\n" + "\n".join(lines)

    # Coercions should work without warnings.
    with tm.assert_produces_warning(None), monkeypatch.context() as patch:
        patch.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
        result = all_parsers.read_csv(StringIO(data))

    assert type(result.a[0]) is np.float64
    assert result.a.dtype == float
def test_warn_if_chunks_have_mismatched_type(all_parsers):
    """Mixed, non-coercible chunk types give an object column.

    see gh-3866: if chunks are different types and can't
    be coerced using numerical types, then issue warning.
    """
    parser = all_parsers

    if parser.engine == "c" and parser.low_memory:
        # Use larger size to hit warning path
        warning_type, size = DtypeWarning, 499999
    else:
        warning_type, size = None, 10000

    ints = [str(i) for i in range(size)]
    data = "a\n" + "\n".join(ints + ["a", "b"] + ints)

    message = (
        r"Columns \(0\) have mixed types. "
        "Specify dtype option on import or set low_memory=False."
    )
    frame = parser.read_csv_check_warnings(warning_type, message, StringIO(data))
    assert frame.a.dtype == object
@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
    """Header-only input gives an empty frame via chunks or ``nrows``.

    see gh-9535
    """
    row_limit = 10
    buf = StringIO("foo,bar\n")

    if not iterator:
        result = all_parsers.read_csv(buf, nrows=row_limit)
    else:
        with all_parsers.read_csv(buf, chunksize=row_limit) as chunk_reader:
            result = next(iter(chunk_reader))

    tm.assert_frame_equal(result, DataFrame(columns=["foo", "bar"]))
def test_read_csv_memory_growth_chunksize(all_parsers):
    """Iterating every chunk of a 1000-line file does not crash.

    see gh-24805
    """
    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as handle:
            handle.writelines(f"{number}\n" for number in range(1000))

        # Let's just make sure that we don't crash
        # as we iteratively process all chunks.
        with all_parsers.read_csv(path, chunksize=20) as chunk_reader:
            for _ in chunk_reader:
                pass
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
    """Chunked read with ``usecols`` where the last row has fewer fields.

    GH#21211. Three header-less rows are read with ``chunksize=2`` and only
    the first two columns kept, giving one two-row chunk and one one-row chunk.
    """
    parser = all_parsers
    data = """1,2,3,4
5,6,7,8
9,10,11
"""
    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6]}),
        DataFrame({"a": [9], "b": [10]}, index=[2]),
    ]

    # Use the reader as a context manager so it is closed when done, in line
    # with the other chunked-read tests in this module.
    with parser.read_csv(
        StringIO(data),
        names=["a", "b"],
        chunksize=2,
        usecols=[0, 1],
        header=None,
    ) as reader:
        result_chunks = list(reader)

    # Without this check the comparison loop would pass vacuously if the
    # reader produced fewer chunks than expected.
    assert len(result_chunks) == len(expected_frames)
    for result, expected in zip(result_chunks, expected_frames):
        tm.assert_frame_equal(result, expected)
def test_chunksize_second_block_shorter(all_parsers):
    """Chunked read where the last row has fewer fields than the header.

    GH#21211. Three data rows are read with ``chunksize=2``; the missing
    field in the final row is expected to come back as NaN.
    """
    parser = all_parsers
    data = """a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
"""
    expected_frames = [
        DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
        DataFrame({"a": [9], "b": [10], "c": [11], "d": [np.nan]}, index=[2]),
    ]

    # Use the reader as a context manager so it is closed when done, in line
    # with the other chunked-read tests in this module.
    with parser.read_csv(StringIO(data), chunksize=2) as reader:
        result_chunks = list(reader)

    # Without this check the comparison loop would pass vacuously if the
    # reader produced fewer chunks than expected.
    assert len(result_chunks) == len(expected_frames)
    for result, expected in zip(result_chunks, expected_frames):
        tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,868 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from inspect import signature
from io import StringIO
import os
from pathlib import Path
import sys
import numpy as np
import pytest
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas import (
DataFrame,
Index,
Timestamp,
compat,
)
import pandas._testing as tm
from pandas.io.parsers import TextFileReader
from pandas.io.parsers.c_parser_wrapper import CParserWrapper
# Reusable decorators that request the named fixtures for a single test via
# ``pytest.mark.usefixtures``. The fixtures themselves are not defined in the
# visible part of this file - presumably in a shared conftest; verify there.
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_override_set_noconvert_columns():
    """Date parsing with ``usecols`` must not depend on ``usecols`` ordering.

    Subclasses of ``TextFileReader`` and ``CParserWrapper`` are wired together
    by hand so that ``usecols`` can be reversed just before the parent's
    ``_set_noconvert_columns`` runs; the parsed frame must still match.
    """
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        # Skips the parent constructor and sets only these two attributes.
        def __init__(self) -> None:
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""

    # Columns 1 and 2 of the selected columns are combined into one date column.
    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    # Assemble the reader manually: options, engine name, then the wrapped engine.
    parser = MyTextFileReader()
    parser.options = {
        "usecols": [0, 2, 3],
        "parse_dates": parse_dates,
        "delimiter": ",",
    }
    parser.engine = "c"
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)
def test_read_csv_local(all_parsers, csv1):
    """Read ``csv1`` through a ``file://`` URL built from its absolute path."""
    scheme = "file:///" if compat.is_platform_windows() else "file://"
    url = f"{scheme}{os.path.abspath(csv1)}"

    result = all_parsers.read_csv(url, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if all_parsers.engine == "pyarrow":
        result.index = result.index.as_unit("ns")

    values = [
        [0.980269, 3.685731, -0.364216805298, -1.159738],
        [1.047916, -0.041232, -0.16181208307, 0.212549],
        [0.498581, 0.731168, -0.537677223318, 1.346270],
        [1.120202, 1.567621, 0.00364077397681, 0.675253],
        [-0.487094, 0.571455, -1.6116394093, 0.103469],
        [0.836649, 0.246462, 0.588542635376, 1.062782],
        [-0.157161, 1.340307, 1.1957779562, -1.097007],
    ]
    days = (3, 4, 5, 6, 7, 10, 11)
    dates = Index([datetime(2000, 1, day) for day in days], name="index")

    expected = DataFrame(values, columns=["A", "B", "C", "D"], index=dates)
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_1000_sep(all_parsers):
    """``thousands=","`` parses ``2,334`` as the integer 2334."""
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    parsed = all_parsers.read_csv(StringIO(data), sep="|", thousands=",")
    tm.assert_frame_equal(
        parsed, DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
    )
@xfail_pyarrow
def test_unnamed_columns(all_parsers):
    """Blank header fields are named ``Unnamed: <position>``."""
    data = """A,B,C,,
1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
    rows = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]
    header = ["A", "B", "C", "Unnamed: 3", "Unnamed: 4"]
    expected = DataFrame(rows, dtype=np.int64, columns=header)

    tm.assert_frame_equal(all_parsers.read_csv(StringIO(data)), expected)
def test_csv_mixed_type(all_parsers):
    """A CSV with one string column and two integer columns parses as such."""
    data = """A,B,C
a,1,2
b,3,4
c,4,5
"""
    parsed = all_parsers.read_csv(StringIO(data))
    tm.assert_frame_equal(
        parsed, DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
    )
@xfail_pyarrow
def test_read_csv_low_memory_no_rows_with_index(all_parsers):
    """``nrows=0`` with ``index_col`` under ``low_memory`` (see gh-21141)."""
    if not all_parsers.low_memory:
        pytest.skip("This is a low-memory specific test")

    data = """A,B,C
1,1,1,2
2,2,3,4
3,3,4,5
"""
    parsed = all_parsers.read_csv(
        StringIO(data), low_memory=True, index_col=0, nrows=0
    )
    tm.assert_frame_equal(parsed, DataFrame(columns=["A", "B", "C"]))
def test_read_csv_dataframe(all_parsers, csv1):
    """Read ``csv1`` with a parsed date index and compare all values."""
    result = all_parsers.read_csv(csv1, index_col=0, parse_dates=True)
    # TODO: make unit check more specific
    if all_parsers.engine == "pyarrow":
        result.index = result.index.as_unit("ns")

    values = [
        [0.980269, 3.685731, -0.364216805298, -1.159738],
        [1.047916, -0.041232, -0.16181208307, 0.212549],
        [0.498581, 0.731168, -0.537677223318, 1.346270],
        [1.120202, 1.567621, 0.00364077397681, 0.675253],
        [-0.487094, 0.571455, -1.6116394093, 0.103469],
        [0.836649, 0.246462, 0.588542635376, 1.062782],
        [-0.157161, 1.340307, 1.1957779562, -1.097007],
    ]
    days = (3, 4, 5, 6, 7, 10, 11)
    dates = Index([datetime(2000, 1, day) for day in days], name="index")

    expected = DataFrame(values, columns=["A", "B", "C", "D"], index=dates)
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize("nrows", [3, 3.0])
def test_read_nrows(all_parsers, nrows):
    """``nrows`` of 3 or 3.0 reads the first three rows (see gh-10476)."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    first_three = [
        ["foo", 2, 3, 4, 5],
        ["bar", 7, 8, 9, 10],
        ["baz", 12, 13, 14, 15],
    ]
    expected = DataFrame(first_three, columns=["index", "A", "B", "C", "D"])

    tm.assert_frame_equal(all_parsers.read_csv(StringIO(data), nrows=nrows), expected)
@xfail_pyarrow
@pytest.mark.parametrize("nrows", [1.2, "foo", -1])
def test_read_nrows_bad(all_parsers, nrows):
    """A fractional, string or negative ``nrows`` raises ``ValueError``."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    expected_error = r"'nrows' must be an integer >=0"

    with pytest.raises(ValueError, match=expected_error):
        all_parsers.read_csv(StringIO(data), nrows=nrows)
def test_nrows_skipfooter_errors(all_parsers):
    """Combining ``skipfooter`` with ``nrows`` raises ``ValueError``."""
    buf = StringIO("a\n1\n2\n3\n4\n5\n6")
    with pytest.raises(ValueError, match="'skipfooter' not supported with 'nrows'"):
        all_parsers.read_csv(buf, skipfooter=1, nrows=5)
@xfail_pyarrow
def test_missing_trailing_delimiters(all_parsers):
    """Rows with an empty or absent last field get NaN in that column."""
    data = """A,B,C,D
1,2,3,4
1,3,3,
1,4,5"""
    rows = [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]]
    expected = DataFrame(rows, columns=["A", "B", "C", "D"])

    tm.assert_frame_equal(all_parsers.read_csv(StringIO(data)), expected)
@xfail_pyarrow
def test_skip_initial_space(all_parsers):
    """``skipinitialspace=True`` strips the space after each delimiter."""
    data = (
        '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
        "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
        "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
        "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
        "0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
        "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
    )

    parsed = all_parsers.read_csv(
        StringIO(data),
        names=list(range(33)),
        header=None,
        na_values=["-9999.0"],
        skipinitialspace=True,
    )

    # One row of 33 fields; the six "-9999.0" fields are given as NA values.
    leading = [
        "09-Apr-2012",
        "01:10:18.300",
        2456026.548822908,
        12849,
        1.00361,
        1.12551,
        330.65659,
        355626618.16711,
        73.48821,
        314.11625,
        1917.09447,
        179.71425,
        80.0,
        240.0,
        -350,
        70.06056,
        344.9837,
        1,
        1,
        -0.689265,
        -0.692787,
        0.212036,
        14.7674,
        41.605,
    ]
    row = leading + [np.nan] * 6 + [0, 12, 128]

    tm.assert_frame_equal(parsed, DataFrame([row]))
@xfail_pyarrow
def test_trailing_delimiters(all_parsers):
    """Trailing delimiters with ``index_col=False`` (see gh-2442)."""
    data = """A,B,C
1,2,3,
4,5,6,
7,8,9,"""
    parsed = all_parsers.read_csv(StringIO(data), index_col=False)
    tm.assert_frame_equal(
        parsed, DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
    )
def test_escapechar(all_parsers):
    """Backslash-escaped quotes inside a quoted field are unescaped.

    Reads three rows with ``escapechar="\\\\"`` and ``quotechar='"'``, then
    checks the third search term and the column names.
    """
    # https://stackoverflow.com/questions/13824840/feature-request-for-
    # pandas-read-csv
    data = '''SEARCH_TERM,ACTUAL_URL
"bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"'''  # noqa: E501

    parser = all_parsers
    result = parser.read_csv(
        StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
    )

    # The escaped quotes around Bergslagen come back as plain double quotes.
    assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'

    tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))
@xfail_pyarrow
def test_ignore_leading_whitespace(all_parsers):
    """Leading spaces are ignored with a whitespace regex separator.

    see gh-3374, gh-6607
    """
    buf = StringIO(" a b c\n 1 2 3\n 4 5 6\n 7 8 9")
    parsed = all_parsers.read_csv(buf, sep=r"\s+")
    tm.assert_frame_equal(
        parsed, DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
    )
@xfail_pyarrow
@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
def test_uneven_lines_with_usecols(all_parsers, usecols):
    """An over-long row is an error unless ``usecols`` is given (see gh-12203)."""
    data = r"""a,b,c
0,1,2
3,4,5,6,7
8,9,10"""

    if usecols is not None:
        expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})
        parsed = all_parsers.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(parsed, expected)
        return

    # Make sure that an error is still raised
    # when the "usecols" parameter is not provided.
    field_count_error = r"Expected \d+ fields in line \d+, saw \d+"
    with pytest.raises(ParserError, match=field_count_error):
        all_parsers.read_csv(StringIO(data))
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # First, check to see that the response of parser when faced with no
        # provided columns raises the correct error, with or without usecols.
        ("", {}, None),
        ("", {"usecols": ["X"]}, None),
        (
            ",,",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"], index=[0], dtype=np.float64),
        ),
        (
            "",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"]),
        ),
    ],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
    """Empty input either raises ``EmptyDataError`` or yields an empty frame."""
    # see gh-12493
    buffer = StringIO(data)
    if expected is not None:
        tm.assert_frame_equal(all_parsers.read_csv(buffer, **kwargs), expected)
        return
    # ``expected=None`` marks the cases that must fail for lack of columns.
    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        all_parsers.read_csv(buffer, **kwargs)
@xfail_pyarrow
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # gh-8661, gh-8679: this should ignore six lines, including
        # lines with trailing whitespace and blank lines.
        (
            {
                "header": None,
                "delim_whitespace": True,
                "skiprows": [0, 1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
        ),
        # gh-8983: test skipping set of rows after a row with trailing spaces.
        (
            {
                "delim_whitespace": True,
                "skiprows": [1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
        ),
    ],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
    """Skipped rows with trailing spaces or tabs do not disturb the result."""
    raw = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n"  # noqa: E501
    # The literal is written with commas; the parser is fed spaces instead.
    spaced = StringIO(raw.replace(",", " "))
    frame = all_parsers.read_csv(spaced, **kwargs)
    tm.assert_frame_equal(frame, expected)
def test_raise_on_sep_with_delim_whitespace(all_parsers):
    """Passing both ``sep`` and ``delim_whitespace=True`` is rejected."""
    # see gh-6607
    buffer = StringIO("a b c\n1 2 3")
    with pytest.raises(ValueError, match="you can only specify one"):
        all_parsers.read_csv(buffer, sep=r"\s", delim_whitespace=True)
def test_read_filepath_or_buffer(all_parsers):
    """A ``bytes`` object is not accepted as ``filepath_or_buffer``."""
    # see gh-43366
    expected_message = "Expected file path name or file-like"
    with pytest.raises(TypeError, match=expected_message):
        all_parsers.read_csv(filepath_or_buffer=b"input")
@xfail_pyarrow
@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
    """Single-character rows survive ``skipinitialspace=True``."""
    # see gh-9710
    text = """\
MyColumn
a
b
a
b\n"""
    options = {"skipinitialspace": True, "delim_whitespace": delim_whitespace}
    frame = all_parsers.read_csv(StringIO(text), **options)
    tm.assert_frame_equal(frame, DataFrame({"MyColumn": list("abab")}))
# Skip for now, actually only one test fails though, but its tricky to xfail
@skip_pyarrow
@pytest.mark.parametrize(
    "sep,skip_blank_lines,exp_data",
    [
        (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (
            ",",
            False,
            [
                [1.0, 2.0, 4.0],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan],
                [5.0, np.nan, 10.0],
                [np.nan, np.nan, np.nan],
                [-70.0, 0.4, 1.0],
            ],
        ),
    ],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):
    """Blank lines are dropped or kept as all-NaN rows per ``skip_blank_lines``.

    Parameters
    ----------
    all_parsers : parser fixture exposing ``read_csv``.
    sep : separator handed to ``read_csv``; for ``\\s+`` the commas in the
        data are first replaced by spaces.
    skip_blank_lines : forwarded to ``read_csv``.
    exp_data : expected row values for columns ``A``, ``B``, ``C``.
    """
    parser = all_parsers
    # The blank lines are essential: two after the first data row and one
    # after the second, matching the three all-NaN rows (positions 1, 2 and 4)
    # that the ``skip_blank_lines=False`` case expects.
    data = """\
A,B,C
1,2.,4.


5.,NaN,10.0

-70,.4,1
"""

    if sep == r"\s+":
        data = data.replace(",", " ")

    result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
    expected = DataFrame(exp_data, columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_whitespace_lines(all_parsers):
    """Lines holding only tabs and spaces are skipped like blank lines."""
    text = """
\t \t\t
\t
A,B,C
\t 1,2.,4.
5.,NaN,10.0
"""
    frame = all_parsers.read_csv(StringIO(text))
    expected = DataFrame(
        [[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    tm.assert_frame_equal(frame, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            """ A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
""",
            DataFrame(
                [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                columns=["A", "B", "C", "D"],
                index=["a", "b", "c"],
            ),
        ),
        (
            " a b c\n1 2 3 \n4 5 6\n 7 8 9",
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
        ),
    ],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
    """Whitespace-aligned text parses with the regex separator ``\\s+``."""
    # see gh-6607
    frame = all_parsers.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(frame, expected)
def test_sub_character(all_parsers, csv_dir_path):
    """A SUB (``\\x1a``) character in a header name is kept in the column label."""
    # see gh-16893
    csv_path = os.path.join(csv_dir_path, "sub_char.csv")
    frame = all_parsers.read_csv(csv_path)
    tm.assert_frame_equal(frame, DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]))
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
def test_filename_with_special_chars(all_parsers, filename):
    """A frame written to a non-ASCII file name reads back unchanged."""
    # see gh-15086.
    original = DataFrame({"a": [1, 2, 3]})
    with tm.ensure_clean(filename) as path:
        original.to_csv(path, index=False)
        round_tripped = all_parsers.read_csv(path)
        tm.assert_frame_equal(round_tripped, original)
def test_read_table_same_signature_as_read_csv(all_parsers):
    """``read_table`` and ``read_csv`` differ only in the default of ``sep``."""
    # GH-34976
    table_sig = signature(all_parsers.read_table)
    csv_sig = signature(all_parsers.read_csv)

    assert table_sig.parameters.keys() == csv_sig.parameters.keys()
    assert table_sig.return_annotation == csv_sig.return_annotation

    for name, csv_param in csv_sig.parameters.items():
        table_param = table_sig.parameters[name]
        if name != "sep":
            assert table_param == csv_param
        else:
            # ``sep`` is the one parameter allowed to differ, and only in
            # its default value.
            assert csv_param.default == ","
            assert table_param.default == "\t"
            assert table_param.annotation == csv_param.annotation
            assert table_param.kind == csv_param.kind
def test_read_table_equivalency_to_read_csv(all_parsers):
    """``read_table`` gives the same frame as ``read_csv`` with ``sep="\\t"``."""
    # see gh-21948
    # As of 0.25.0, read_table is undeprecated
    tab_text = "a\tb\n1\t2\n3\t4"
    via_csv = all_parsers.read_csv(StringIO(tab_text), sep="\t")
    via_table = all_parsers.read_table(StringIO(tab_text))
    tm.assert_frame_equal(via_table, via_csv)
@pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
    """Reading works while a profile function is installed via ``sys.setprofile``.

    Parameters
    ----------
    all_parsers : parser fixture exposing ``read_csv`` and ``read_table``.
    read_func : name of the reader method to call on the parser.
    """
    # GH#41069
    parser = all_parsers
    data = "a b\n0 1"

    # Remember whatever hook was active so it can be put back afterwards,
    # and restore it even when the read raises; otherwise the no-op hook
    # would leak into every test that runs later in the same process.
    previous_profile = sys.getprofile()
    sys.setprofile(lambda *a, **k: None)
    try:
        result = getattr(parser, read_func)(StringIO(data))
    finally:
        sys.setprofile(previous_profile)

    expected = DataFrame({"a b": ["0 1"]})
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_first_row_bom(all_parsers):
    """A BOM before a quoted header row does not end up in the column names."""
    # see gh-26545
    header_only = '''\ufeff"Head1"\t"Head2"\t"Head3"'''
    frame = all_parsers.read_csv(StringIO(header_only), delimiter="\t")
    tm.assert_frame_equal(frame, DataFrame(columns=["Head1", "Head2", "Head3"]))
@xfail_pyarrow
def test_first_row_bom_unquoted(all_parsers):
    """A BOM before an unquoted header row does not end up in the column names."""
    # see gh-36343
    header_only = """\ufeffHead1\tHead2\tHead3"""
    frame = all_parsers.read_csv(StringIO(header_only), delimiter="\t")
    tm.assert_frame_equal(frame, DataFrame(columns=["Head1", "Head2", "Head3"]))
@xfail_pyarrow
@pytest.mark.parametrize("nrows", range(1, 6))
def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
    """With ``skip_blank_lines=False`` blank lines count toward ``nrows``."""
    # GH 28071
    text = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
    blank = [np.nan, np.nan]
    full = DataFrame([blank, blank, [1, 2], blank, [3, 4]], columns=list("ab"))
    frame = all_parsers.read_csv(
        StringIO(text), header=3, nrows=nrows, skip_blank_lines=False
    )
    tm.assert_frame_equal(frame, full[:nrows])
@xfail_pyarrow
def test_no_header_two_extra_columns(all_parsers):
    """Surplus fields beyond ``names`` are dropped with a ``ParserWarning``."""
    # GH 26218
    names = ["one", "two", "three"]
    warning_text = (
        "Length of header or names does not match length of data. "
        "This leads to a loss of data with index_col=False."
    )
    frame = all_parsers.read_csv_check_warnings(
        ParserWarning,
        warning_text,
        StringIO("foo,bar,baz,bam,blah"),
        header=None,
        names=names,
        index_col=False,
    )
    tm.assert_frame_equal(frame, DataFrame([["foo", "bar", "baz"]], columns=names))
def test_read_csv_names_not_accepting_sets(all_parsers):
    """A ``set`` is rejected as ``names`` because it has no defined order."""
    # GH 34946
    text = """\
1,2,3
4,5,6\n"""
    unordered_names = set("QAZ")
    with pytest.raises(ValueError, match="Names should be an ordered collection."):
        all_parsers.read_csv(StringIO(text), names=unordered_names)
@xfail_pyarrow
def test_read_table_delim_whitespace_default_sep(all_parsers):
    """``read_table`` accepts ``delim_whitespace=True`` with its default ``sep``."""
    # GH: 35958
    buffer = StringIO("a b c\n1 -2 -3\n4 5 6")
    frame = all_parsers.read_table(buffer, delim_whitespace=True)
    tm.assert_frame_equal(
        frame, DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
    )
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
    """``read_csv`` rejects ``delim_whitespace=True`` plus an explicit separator."""
    # GH: 35958
    buffer = StringIO("a b c\n1 -2 -3\n4 5 6")
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    # The separator may arrive under either keyword; both must be refused.
    for keyword in ("sep", "delimiter"):
        with pytest.raises(ValueError, match=msg):
            all_parsers.read_csv(buffer, delim_whitespace=True, **{keyword: delimiter})
def test_read_csv_delimiter_and_sep_no_default(all_parsers):
    """Supplying both ``sep`` and ``delimiter`` raises ``ValueError``."""
    # GH#39823
    buffer = StringIO("a,b\n1,2")
    expected_message = "Specified a sep and a delimiter; you can only specify one."
    with pytest.raises(ValueError, match=expected_message):
        all_parsers.read_csv(buffer, sep=" ", delimiter=".")
@pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
def test_read_csv_line_break_as_separator(kwargs, all_parsers):
    """A newline is refused as separator under either keyword."""
    # GH#43528
    text = """a,b,c
1,2,3
"""
    msg = (
        r"Specified \\n as separator or delimiter. This forces the python engine "
        r"which does not accept a line terminator. Hence it is not allowed to use "
        r"the line terminator as separator."
    )
    buffer = StringIO(text)
    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(buffer, **kwargs)
@pytest.mark.parametrize("delimiter", [",", "\t"])
def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
    """``read_table`` rejects ``delim_whitespace=True`` plus an explicit separator."""
    # GH: 35958
    buffer = StringIO("a b c\n1 -2 -3\n4 5 6")
    msg = (
        "Specified a delimiter with both sep and "
        "delim_whitespace=True; you can only specify one."
    )
    # The separator may arrive under either keyword; both must be refused.
    for keyword in ("sep", "delimiter"):
        with pytest.raises(ValueError, match=msg):
            all_parsers.read_table(
                buffer, delim_whitespace=True, **{keyword: delimiter}
            )
@xfail_pyarrow
def test_dict_keys_as_names(all_parsers):
    """A ``dict_keys`` view is accepted as ``names``."""
    # GH: 36928
    dtype_by_name = {"a": int, "b": int}
    frame = all_parsers.read_csv(StringIO("1,2"), names=dtype_by_name.keys())
    tm.assert_frame_equal(frame, DataFrame({"a": [1], "b": [2]}))
@xfail_pyarrow
def test_encoding_surrogatepass(all_parsers):
    """Bytes needing ``surrogatepass`` read with that handler and fail without."""
    # GH39017
    raw = b"\xed\xbd\xbf"
    text = raw.decode("utf-8", errors="surrogatepass")
    doubled = text * 2

    expected = DataFrame({text: [text]}, index=[doubled])
    expected.index.name = doubled

    with tm.ensure_clean() as path:
        # Header row and data row are byte-identical: "<raw><raw>,<raw>".
        row = raw * 2 + b"," + raw
        Path(path).write_bytes(row + b"\n" + row)
        frame = all_parsers.read_csv(
            path, encoding_errors="surrogatepass", index_col=0
        )
        tm.assert_frame_equal(frame, expected)
        with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
            all_parsers.read_csv(path)
def test_malformed_second_line(all_parsers):
    """A leading blank line is counted when ``header=1`` and blanks are kept."""
    # see GH14782
    frame = all_parsers.read_csv(
        StringIO("\na\nb\n"), skip_blank_lines=False, header=1
    )
    tm.assert_frame_equal(frame, DataFrame({"a": ["b"]}))
@xfail_pyarrow
def test_short_single_line(all_parsers):
    """A single row shorter than ``names`` is padded with NaN."""
    # GH 47566
    names = ["a", "b", "c"]
    frame = all_parsers.read_csv(StringIO("1,2"), header=None, names=names)
    tm.assert_frame_equal(frame, DataFrame({"a": [1], "b": [2], "c": [np.nan]}))
@xfail_pyarrow
def test_short_multi_line(all_parsers):
    """Several rows shorter than ``names`` are each padded with NaN."""
    # GH 47566
    names = ["a", "b", "c"]
    frame = all_parsers.read_csv(StringIO("1,2\n1,2"), header=None, names=names)
    expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
    tm.assert_frame_equal(frame, expected)
def test_read_seek(all_parsers):
    """Reading starts at the handle's current position, not at the file start."""
    # GH48646
    leading_line = "### DATA\n"
    body = "nkey,value\ntables,rectangular\n"
    with tm.ensure_clean() as path:
        Path(path).write_text(leading_line + body, encoding="utf-8")
        with open(path, encoding="utf-8") as handle:
            # Consume the first line so the parser is handed a mid-file handle.
            handle.readline()
            actual = all_parsers.read_csv(handle)
        expected = all_parsers.read_csv(StringIO(body))
    tm.assert_frame_equal(actual, expected)

View File

@@ -0,0 +1,87 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
import csv
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.parsers import TextParser
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow
def test_read_data_list(all_parsers):
    """``TextParser`` over pre-split rows matches ``read_csv`` over the text."""
    options = {"index_col": 0}
    text = "A,B,C\nfoo,1,2,3\nbar,4,5,6"
    rows = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
    expected = all_parsers.read_csv(StringIO(text), **options)

    with TextParser(rows, chunksize=2, **options) as text_parser:
        result = text_parser.read()

    tm.assert_frame_equal(result, expected)
def test_reader_list(all_parsers):
    """Iterating a chunked ``TextParser`` yields consecutive two-row slices."""
    text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    options = {"index_col": 0}
    rows = list(csv.reader(StringIO(text)))
    with TextParser(rows, chunksize=2, **options) as reader:
        first, second, third = list(reader)[:3]

    whole = all_parsers.read_csv(StringIO(text), **options)
    tm.assert_frame_equal(first, whole[:2])
    tm.assert_frame_equal(second, whole[2:4])
    tm.assert_frame_equal(third, whole[4:])
def test_reader_list_skiprows(all_parsers):
    """With ``skiprows=[1]`` the first chunk starts at the second data row."""
    text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    options = {"index_col": 0}
    rows = list(csv.reader(StringIO(text)))
    with TextParser(rows, chunksize=2, skiprows=[1], **options) as reader:
        chunks = list(reader)

    whole = all_parsers.read_csv(StringIO(text), **options)
    first_chunk = chunks[0]
    tm.assert_frame_equal(first_chunk, whole[1:3])
def test_read_csv_parse_simple_list(all_parsers):
    """Comma-free lines become a single-column frame, spaces included."""
    text = """foo
bar baz
qux foo
foo
bar"""
    expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"])
    frame = all_parsers.read_csv(StringIO(text), header=None)
    tm.assert_frame_equal(frame, expected)

View File

@@ -0,0 +1,63 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal):
    """Thousands and decimal markers are honoured together, in either pairing."""
    number_format = {"thousands": thousands, "decimal": decimal}
    frame = all_parsers.read_csv(StringIO(data), sep="|", **number_format)
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
    tm.assert_frame_equal(frame, expected)
def test_euro_decimal_format(all_parsers):
    """Semicolon-separated text with comma decimals parses to floats."""
    text = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""
    columns = ["Id", "Number1", "Number2", "Text1", "Text2", "Number3"]
    rows = [
        [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
        [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
        [3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
    ]
    frame = all_parsers.read_csv(StringIO(text), sep=";", decimal=",")
    tm.assert_frame_equal(frame, DataFrame(rows, columns=columns))

View File

@@ -0,0 +1,416 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import (
BytesIO,
StringIO,
)
import os
import platform
from urllib.error import URLError
import uuid
import pytest
from pandas.errors import (
EmptyDataError,
ParserError,
)
import pandas.util._test_decorators as td
from pandas import DataFrame
import pandas._testing as tm
# TODO(1.4) Please xfail individual tests at release time
# instead of skip
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.network
@pytest.mark.single_cpu
def test_url(all_parsers, csv_dir_path, httpserver):
    """Reading over HTTP gives the same frame as reading the local file."""
    options = {"sep": "\t"}
    salaries_path = os.path.join(csv_dir_path, "salaries.csv")

    # Serve the local file's text from the test HTTP server.
    with open(salaries_path, encoding="utf-8") as source:
        httpserver.serve_content(content=source.read())

    from_url = all_parsers.read_csv(httpserver.url, **options)
    from_disk = all_parsers.read_csv(salaries_path, **options)
    tm.assert_frame_equal(from_url, from_disk)
@pytest.mark.slow
def test_local_file(all_parsers, csv_dir_path):
    """A ``file://localhost/`` URL reads the same as the plain local path."""
    options = {"sep": "\t"}
    salaries_path = os.path.join(csv_dir_path, "salaries.csv")
    from_disk = all_parsers.read_csv(salaries_path, **options)
    file_url = "file://localhost/" + salaries_path

    try:
        from_url = all_parsers.read_csv(file_url, **options)
    except URLError:
        # Fails on some systems.
        pytest.skip("Failing on: " + " ".join(platform.uname()))
    else:
        tm.assert_frame_equal(from_url, from_disk)
def test_path_path_lib(all_parsers):
    """A frame round-trips through ``to_csv``/``read_csv`` via ``round_trip_pathlib``."""

    def read_back(path):
        return all_parsers.read_csv(path, index_col=0)

    original = tm.makeDataFrame()
    round_tripped = tm.round_trip_pathlib(original.to_csv, read_back)
    tm.assert_frame_equal(original, round_tripped)
def test_path_local_path(all_parsers):
    """A frame round-trips through ``to_csv``/``read_csv`` via ``round_trip_localpath``."""

    def read_back(path):
        return all_parsers.read_csv(path, index_col=0)

    original = tm.makeDataFrame()
    round_tripped = tm.round_trip_localpath(original.to_csv, read_back)
    tm.assert_frame_equal(original, round_tripped)
def test_nonexistent_path(all_parsers):
    """A missing file raises ``FileNotFoundError`` carrying the plain path."""
    # gh-2428: pls no segfault
    # gh-14086: raise more helpful FileNotFoundError
    # GH#29233 "File foo" instead of "File b'foo'"
    missing = f"{uuid.uuid4()}.csv"
    with pytest.raises(FileNotFoundError, match=r"\[Errno 2\]") as excinfo:
        all_parsers.read_csv(missing)
    assert excinfo.value.filename == missing
@td.skip_if_windows  # os.chmod does not work in windows
def test_no_permission(all_parsers):
    """Reading a file with all mode bits cleared raises ``PermissionError``."""
    # GH 23784
    errno_pattern = r"\[Errno 13\]"
    with tm.ensure_clean() as path:
        os.chmod(path, 0)  # make file unreadable

        # verify that this process cannot open the file (not running as sudo)
        try:
            with open(path, encoding="utf-8"):
                pass
            pytest.skip("Running as sudo.")
        except PermissionError:
            pass

        with pytest.raises(PermissionError, match=errno_pattern) as excinfo:
            all_parsers.read_csv(path)
        assert excinfo.value.filename == path
@pytest.mark.parametrize(
    "data,kwargs,expected,msg",
    [
        # gh-10728: WHITESPACE_LINE
        (
            "a,b,c\n4,5,6\n ",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # gh-10548: EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL_NOP
        (
            "a,b,c\n4,5,6\n\r",
            {},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_COMMENT
        (
            "a,b,c\n4,5,6#comment",
            {"comment": "#"},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # SKIP_LINE
        (
            "a,b,c\n4,5,6\nskipme",
            {"skiprows": [2]},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_LINE_COMMENT
        (
            "a,b,c\n4,5,6\n#comment",
            {"comment": "#", "skip_blank_lines": False},
            DataFrame([[4, 5, 6]], columns=["a", "b", "c"]),
            None,
        ),
        # IN_FIELD
        (
            "a,b,c\n4,5,6\n ",
            {"skip_blank_lines": False},
            DataFrame([["4", 5, 6], [" ", None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # EAT_CRNL
        (
            "a,b,c\n4,5,6\n\r",
            {"skip_blank_lines": False},
            DataFrame([[4, 5, 6], [None, None, None]], columns=["a", "b", "c"]),
            None,
        ),
        # ESCAPED_CHAR
        (
            "a,b,c\n4,5,6\n\\",
            {"escapechar": "\\"},
            None,
            "(EOF following escape character)|(unexpected end of data)",
        ),
        # ESCAPE_IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"\\',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
        # IN_QUOTED_FIELD
        (
            'a,b,c\n4,5,6\n"',
            {"escapechar": "\\"},
            None,
            "(EOF inside string starting at row 2)|(unexpected end of data)",
        ),
    ],
    ids=[
        "whitespace-line",
        "eat-line-comment",
        "eat-crnl-nop",
        "eat-comment",
        "skip-line",
        "eat-line-comment",
        "in-field",
        "eat-crnl",
        "escaped-char",
        "escape-in-quoted-field",
        "in-quoted-field",
    ],
)
def test_eof_states(all_parsers, data, kwargs, expected, msg):
    """Input ending in each listed tokenizer state either parses or errors.

    A case with ``expected=None`` must raise ``ParserError`` matching ``msg``;
    every other case must produce exactly ``expected``.
    """
    # see gh-10728, gh-10548
    buffer = StringIO(data)
    if expected is not None:
        tm.assert_frame_equal(all_parsers.read_csv(buffer, **kwargs), expected)
        return
    with pytest.raises(ParserError, match=msg):
        all_parsers.read_csv(buffer, **kwargs)
def test_temporary_file(all_parsers):
    """An open ``w+`` file handle can be written, rewound and then parsed."""
    # see gh-13398
    with tm.ensure_clean(mode="w+", return_filelike=True) as handle:
        handle.write("0 0")
        handle.flush()
        handle.seek(0)  # rewind so the parser sees what was just written

        frame = all_parsers.read_csv(handle, sep=r"\s+", header=None)
        tm.assert_frame_equal(frame, DataFrame([[0, 0]]))
def test_internal_eof_byte(all_parsers):
    """A ``\\x1a`` byte inside a field is data, not an end-of-file marker."""
    # see gh-5500
    text = "a,b\n1\x1a,2"
    frame = all_parsers.read_csv(StringIO(text))
    tm.assert_frame_equal(frame, DataFrame([["1\x1a", 2]], columns=["a", "b"]))
def test_internal_eof_byte_to_file(all_parsers):
    """A ``\\x1a`` byte inside a quoted field survives a read from disk."""
    # see gh-16559
    payload = b'c1,c2\r\n"test \x1a test", test\r\n'
    expected = DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"])
    file_name = f"__{uuid.uuid4()}__.csv"

    with tm.ensure_clean(file_name) as path:
        with open(path, "wb") as sink:
            sink.write(payload)

        frame = all_parsers.read_csv(path)
        tm.assert_frame_equal(frame, expected)
def test_file_handle_string_io(all_parsers):
    """A caller-supplied ``StringIO`` is still open after ``read_csv``."""
    # gh-14418
    #
    # Don't close user provided file handles.
    buffer = StringIO("a,b\n1,2")
    all_parsers.read_csv(buffer)
    assert not buffer.closed
def test_file_handles_with_open(all_parsers, csv1):
    """Handles from ``open`` (text and binary) are left open by ``read_csv``."""
    # gh-14418
    #
    # Don't close user provided file handles.
    # Text mode gets an explicit encoding; binary mode must not be given one.
    for mode, encoding in (("r", "utf-8"), ("rb", None)):
        with open(csv1, mode, encoding=encoding) as handle:
            all_parsers.read_csv(handle)
            assert not handle.closed
def test_invalid_file_buffer_class(all_parsers):
    """An object with no file-like interface is rejected with ``ValueError``."""
    # see gh-15337

    class InvalidBuffer:
        pass

    expected_message = "Invalid file path or buffer object type"
    with pytest.raises(ValueError, match=expected_message):
        all_parsers.read_csv(InvalidBuffer())
def test_invalid_file_buffer_mock(all_parsers):
    """An instance of an empty class is rejected as a buffer."""
    # see gh-15337

    class Foo:
        pass

    not_a_buffer = Foo()
    with pytest.raises(ValueError, match="Invalid file path or buffer object type"):
        all_parsers.read_csv(not_a_buffer)
def test_valid_file_buffer_seems_invalid(all_parsers):
    """``read_csv`` works on a buffer whose ``tell`` and ``seek`` always raise."""
    # gh-16135: we want to ensure that "tell" and "seek"
    # aren't actually being used when we call `read_csv`
    #
    # Thus, while the object may look "invalid" (these
    # methods are attributes of the `StringIO` class),
    # it is still a valid file-object for our purposes.
    class NoSeekTellBuffer(StringIO):
        def tell(self):
            raise AttributeError("No tell method")

        def seek(self, pos, whence=0):
            raise AttributeError("No seek method")

    buffer = NoSeekTellBuffer("a\n1")
    frame = all_parsers.read_csv(buffer)
    tm.assert_frame_equal(frame, DataFrame({"a": [1]}))
@pytest.mark.parametrize("io_class", [StringIO, BytesIO])
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_read_csv_file_handle(all_parsers, io_class, encoding):
    """
    Test whether read_csv does not close user-provided file handles.

    GH 36980
    """
    text = "a,b\n1,2"
    # A BytesIO needs encoded bytes; a StringIO takes the text as is.
    if io_class == BytesIO:
        handle = io_class(text.encode("utf-8"))
    else:
        handle = io_class(text)

    frame = all_parsers.read_csv(handle, encoding=encoding)
    tm.assert_frame_equal(frame, DataFrame({"a": [1], "b": [2]}))
    assert not handle.closed
def test_memory_map_compression(all_parsers, compression):
    """
    Support memory map for compressed files.

    GH 37621
    """
    original = DataFrame({"a": [1], "b": [2]})

    with tm.ensure_clean() as path:
        original.to_csv(path, index=False, compression=compression)
        read_back = all_parsers.read_csv(
            path, memory_map=True, compression=compression
        )
        tm.assert_frame_equal(read_back, original)
def test_context_manager(all_parsers, datapath):
    """The reader's own file handle is closed when its ``with`` block fails.

    Parameters
    ----------
    all_parsers : parser fixture exposing ``read_csv``.
    datapath : fixture that builds the path to a bundled data file.
    """
    # make sure that opened files are closed
    parser = all_parsers

    path = datapath("io", "data", "csv", "iris.csv")

    reader = parser.read_csv(path, chunksize=1)
    assert not reader.handles.handle.closed

    # Force an error inside the context. A dedicated exception (rather than
    # ``assert False`` caught as AssertionError) cannot be stripped by ``-O``
    # and cannot hide a genuine assertion failure raised by the reader.
    forced = "forced failure inside the reader context"
    with pytest.raises(ValueError, match=forced):
        with reader:
            next(reader)
            raise ValueError(forced)

    # Checked unconditionally, so the test cannot pass without verifying it.
    assert reader.handles.handle.closed
def test_context_manageri_user_provided(all_parsers, datapath):
    """A caller-owned handle stays open when the reader's ``with`` block fails.

    Parameters
    ----------
    all_parsers : parser fixture exposing ``read_csv``.
    datapath : fixture that builds the path to a bundled data file.
    """
    # make sure that user-provided handles are not closed
    parser = all_parsers

    with open(datapath("io", "data", "csv", "iris.csv"), encoding="utf-8") as path:
        reader = parser.read_csv(path, chunksize=1)
        assert not reader.handles.handle.closed

        # Force an error inside the context. A dedicated exception (rather
        # than ``assert False`` caught as AssertionError) cannot be stripped
        # by ``-O`` and cannot hide a genuine assertion failure.
        forced = "forced failure inside the reader context"
        with pytest.raises(ValueError, match=forced):
            with reader:
                next(reader)
                raise ValueError(forced)

        # Checked unconditionally, while the caller's ``open`` is still active.
        assert not reader.handles.handle.closed
def test_file_descriptor_leak(all_parsers, using_copy_on_write):
    """Reading an empty file raises ``EmptyDataError``."""
    # GH 31488
    empty_file_message = "No columns to parse from file"
    with tm.ensure_clean() as path:
        with pytest.raises(EmptyDataError, match=empty_file_message):
            all_parsers.read_csv(path)
def test_memory_map(all_parsers, csv_dir_path):
    """``memory_map=True`` reads ``test_mmap.csv`` into the expected frame."""
    csv_path = os.path.join(csv_dir_path, "test_mmap.csv")
    frame = all_parsers.read_csv(csv_path, memory_map=True)

    expected = DataFrame(
        {"a": [1, 2, 3], "b": ["one", "two", "three"], "c": ["I", "II", "III"]}
    )
    tm.assert_frame_equal(frame, expected)

View File

@@ -0,0 +1,65 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas.compat import is_platform_linux
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
def test_float_parser(all_parsers):
    """Float spellings parse to the same values that ``float()`` produces."""
    # see gh-9565
    text = "45e-1,4.5,45.,inf,-inf"
    expected_row = list(map(float, text.split(",")))
    frame = all_parsers.read_csv(StringIO(text), header=None)
    tm.assert_frame_equal(frame, DataFrame([expected_row]))
def test_scientific_no_exponent(all_parsers_all_precisions):
    """Tokens such as ``2e`` with no exponent digits round-trip as strings."""
    # see gh-12215
    parser, precision = all_parsers_all_precisions
    original = DataFrame.from_dict(
        {"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}
    )
    csv_text = original.to_csv(index=False)
    round_tripped = parser.read_csv(StringIO(csv_text), float_precision=precision)
    tm.assert_frame_equal(round_tripped, original)
@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999])
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
    """A hugely negative exponent parses to ``0.0`` at every precision."""
    # GH#38753
    parser, precision = all_parsers_all_precisions
    buffer = StringIO(f"data\n10E{neg_exp}")
    frame = parser.read_csv(buffer, float_precision=precision)
    tm.assert_frame_equal(frame, DataFrame({"data": [0.0]}))
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
    """An 18-digit exponent gives inf/0.0 under ``round_trip``, else stays text."""
    # GH#38753
    parser, precision = all_parsers_all_precisions
    token = f"10E{exp}"
    result = parser.read_csv(StringIO(f"data\n{token}"), float_precision=precision)

    if precision != "round_trip":
        # Every other precision is expected to hand the token back as text.
        expected = DataFrame({"data": [token]})
    else:
        if exp == 999999999999999999 and is_platform_linux():
            mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
            request.node.add_marker(mark)

        expected = DataFrame({"data": [np.inf if exp > 0 else 0.0]})

    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,299 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from datetime import datetime
from io import StringIO
import os
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
# GH#43650: Some expected failures with the pyarrow engine can occasionally
# cause a deadlock instead, so we skip these instead of xfailing
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
""",
            {"index_col": 0, "names": ["index", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=Index(["foo", "bar", "baz", "qux", "foo2", "bar2"], name="index"),
                columns=["A", "B", "C", "D"],
            ),
        ),
        (
            """foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
""",
            {"index_col": [0, 1], "names": ["index1", "index2", "A", "B", "C", "D"]},
            DataFrame(
                [
                    [2, 3, 4, 5],
                    [7, 8, 9, 10],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                    [12, 13, 14, 15],
                ],
                index=MultiIndex.from_tuples(
                    [
                        ("foo", "one"),
                        ("foo", "two"),
                        ("foo", "three"),
                        ("bar", "one"),
                        ("bar", "two"),
                    ],
                    names=["index1", "index2"],
                ),
                columns=["A", "B", "C", "D"],
            ),
        ),
    ],
)
def test_pass_names_with_index(all_parsers, data, kwargs, expected):
    """Explicit ``names`` plus ``index_col`` give a named (Multi)Index."""
    frame = all_parsers.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(frame, expected)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
    """Headerless data with ``names`` yields a MultiIndex with unnamed levels."""
    text = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    # Everything after the first newline, i.e. the same text minus its header.
    _, _, headless_text = text.partition("\n")

    expected = all_parsers.read_csv(StringIO(text), index_col=index_col)
    # No index names in headless data.
    expected.index.names = [None] * 2

    frame = all_parsers.read_csv(
        StringIO(headless_text),
        index_col=index_col,
        header=None,
        names=["A", "B", "C", "D"],
    )
    tm.assert_frame_equal(frame, expected)
@xfail_pyarrow
def test_multi_index_no_level_names_implicit(all_parsers):
    """Two surplus leading fields per row become an unnamed two-level index."""
    text = """A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
foo,three,12,13,14,15
bar,one,12,13,14,15
bar,two,12,13,14,15
"""
    index_tuples = [
        ("foo", "one"),
        ("foo", "two"),
        ("foo", "three"),
        ("bar", "one"),
        ("bar", "two"),
    ]
    values = [
        [2, 3, 4, 5],
        [7, 8, 9, 10],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
    ]
    expected = DataFrame(
        values,
        columns=["A", "B", "C", "D"],
        index=MultiIndex.from_tuples(index_tuples),
    )
    frame = all_parsers.read_csv(StringIO(text))
    tm.assert_frame_equal(frame, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,expected,header",
    [
        ("a,b", DataFrame(columns=["a", "b"]), [0]),
        (
            "a,b\nc,d",
            DataFrame(columns=MultiIndex.from_tuples([("a", "c"), ("b", "d")])),
            [0, 1],
        ),
    ],
)
@pytest.mark.parametrize("round_trip", [True, False])
def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip):
    """Header-only input with list ``header`` yields an empty frame."""
    # see gh-14545
    # In round-trip mode the text comes from writing ``expected`` itself.
    if round_trip:
        text = expected.to_csv(index=False)
    else:
        text = data
    frame = all_parsers.read_csv(StringIO(text), header=header)
    tm.assert_frame_equal(frame, expected)
@xfail_pyarrow
def test_no_unnamed_index(all_parsers):
    """A header starting with the separator gets an ``Unnamed: 0`` column."""
    text = """ id c0 c1 c2
0 1 0 a b
1 2 0 c d
2 2 2 e f
"""
    expected = DataFrame(
        [[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], [2, 2, 2, "e", "f"]],
        columns=["Unnamed: 0", "id", "c0", "c1", "c2"],
    )
    frame = all_parsers.read_csv(StringIO(text), sep=" ")
    tm.assert_frame_equal(frame, expected)
def test_read_duplicate_index_explicit(all_parsers):
    """Repeated labels are allowed in an index chosen with ``index_col=0``."""
    text = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    labels = ["foo", "bar", "baz", "qux", "foo", "bar"]
    values = [
        [2, 3, 4, 5],
        [7, 8, 9, 10],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
    ]
    expected = DataFrame(
        values, columns=["A", "B", "C", "D"], index=Index(labels, name="index")
    )
    frame = all_parsers.read_csv(StringIO(text), index_col=0)
    tm.assert_frame_equal(frame, expected)
@xfail_pyarrow
def test_read_duplicate_index_implicit(all_parsers):
    """Repeated labels are allowed in an index inferred from a surplus field."""
    text = """A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo,12,13,14,15
bar,12,13,14,15
"""
    labels = ["foo", "bar", "baz", "qux", "foo", "bar"]
    values = [
        [2, 3, 4, 5],
        [7, 8, 9, 10],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
        [12, 13, 14, 15],
    ]
    expected = DataFrame(values, columns=["A", "B", "C", "D"], index=Index(labels))
    frame = all_parsers.read_csv(StringIO(text))
    tm.assert_frame_equal(frame, expected)
@xfail_pyarrow
def test_read_csv_no_index_name(all_parsers, csv_dir_path):
    """Read ``test2.csv`` using its first column as a date-parsed, unnamed index."""
    path = os.path.join(csv_dir_path, "test2.csv")

    # Consecutive days 2000-01-03 .. 2000-01-07.
    dates = Index([datetime(2000, 1, day) for day in range(3, 8)])
    rows = [
        [0.980269, 3.685731, -0.364216805298, -1.159738, "foo"],
        [1.047916, -0.041232, -0.16181208307, 0.212549, "bar"],
        [0.498581, 0.731168, -0.537677223318, 1.346270, "baz"],
        [1.120202, 1.567621, 0.00364077397681, 0.675253, "qux"],
        [-0.487094, 0.571455, -1.6116394093, 0.103469, "foo2"],
    ]
    expected = DataFrame(rows, columns=["A", "B", "C", "D", "E"], index=dates)

    parsed = all_parsers.read_csv(path, index_col=0, parse_dates=True)
    tm.assert_frame_equal(parsed, expected)
@xfail_pyarrow
def test_empty_with_index(all_parsers):
    """Header-only input with ``index_col=0`` gives an empty named index (gh-10184)."""
    expected = DataFrame(columns=["y"], index=Index([], name="x"))
    parsed = all_parsers.read_csv(StringIO("x,y"), index_col=0)
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
def test_empty_with_multi_index(all_parsers):
    """Header-only input with two index columns gives an empty MultiIndex (gh-10467)."""
    empty_index = MultiIndex.from_arrays([[]] * 2, names=["x", "y"])
    expected = DataFrame(columns=["z"], index=empty_index)
    parsed = all_parsers.read_csv(StringIO("x,y,z"), index_col=["x", "y"])
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
def test_empty_with_reversed_multi_index(all_parsers):
    """Index level names follow the order given in ``index_col=[1, 0]``."""
    empty_index = MultiIndex.from_arrays([[]] * 2, names=["y", "x"])
    expected = DataFrame(columns=["z"], index=empty_index)
    parsed = all_parsers.read_csv(StringIO("x,y,z"), index_col=[1, 0])
    tm.assert_frame_equal(parsed, expected)

View File

@@ -0,0 +1,70 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
option_context,
)
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
@xfail_pyarrow
@pytest.mark.parametrize("na_filter", [True, False])
def test_inf_parsing(all_parsers, na_filter):
    """Mixed-case, optionally signed spellings of ``inf`` parse as +/- infinity."""
    data = """\
,A
a,inf
b,-inf
c,+Inf
d,-Inf
e,INF
f,-INF
g,+INf
h,-INf
i,inF
j,-inF"""
    # Rows alternate positive / negative infinity, labelled a..j.
    alternating = [float("inf"), float("-inf")] * 5
    expected = DataFrame({"A": alternating}, index=list("abcdefghij"))

    parsed = all_parsers.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(parsed, expected)
@xfail_pyarrow
@pytest.mark.parametrize("na_filter", [True, False])
def test_infinity_parsing(all_parsers, na_filter):
    """``Infinity`` with or without a sign parses as +/- infinity."""
    data = """\
,A
a,Infinity
b,-Infinity
c,+Infinity
"""
    values = [float(text) for text in ("infinity", "-infinity", "+infinity")]
    expected = DataFrame({"A": values}, index=["a", "b", "c"])

    parsed = all_parsers.read_csv(StringIO(data), index_col=0, na_filter=na_filter)
    tm.assert_frame_equal(parsed, expected)
def test_read_csv_with_use_inf_as_na(all_parsers):
    """Reading under the deprecated ``use_inf_as_na`` option still parses NaN.

    See https://github.com/pandas-dev/pandas/issues/35493
    """
    expected = DataFrame([1.0, np.nan, 3.0])
    deprecation = "use_inf_as_na option is deprecated"

    # The warning checker is entered first so that it is active while the
    # option context is set up.
    with tm.assert_produces_warning(
        FutureWarning, match=deprecation
    ), option_context("use_inf_as_na", True):
        parsed = all_parsers.read_csv(StringIO("1.0\nNaN\n3.0"), header=None)
    tm.assert_frame_equal(parsed, expected)

View File

@@ -0,0 +1,215 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
# GH#43650: Some expected failures with the pyarrow engine can occasionally
# cause a deadlock instead, so we skip these instead of xfailing
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_int_conversion(all_parsers):
    """A column written as ``1.0`` stays float; a column written as ``1`` is int."""
    data = """A,B
1.0,1
2.0,2
3.0,3
"""
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
    parsed = all_parsers.read_csv(StringIO(data))
    tm.assert_frame_equal(parsed, expected)
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            "A,B\nTrue,1\nFalse,2\nTrue,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
            {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
            DataFrame(
                [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
                columns=["A", "B"],
            ),
        ),
        (
            "A,B\nTRUE,1\nFALSE,2\nTRUE,3",
            {},
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
        ),
        (
            "A,B\nfoo,bar\nbar,foo",
            {"true_values": ["foo"], "false_values": ["bar"]},
            DataFrame([[True, False], [False, True]], columns=["A", "B"]),
        ),
    ],
)
def test_parse_bool(all_parsers, data, kwargs, expected):
    """Boolean text, default or via ``true_values``/``false_values``, parses to bool."""
    source = StringIO(data)
    parsed = all_parsers.read_csv(source, **kwargs)
    tm.assert_frame_equal(parsed, expected)
def test_parse_integers_above_fp_precision(all_parsers):
    """17-digit integers differing only in the last digit are parsed exactly."""
    data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""
    # Same sequence as the input: four ...191, five ...192, one ...194.
    numbers = (
        [17007000002000191] * 4 + [17007000002000192] * 5 + [17007000002000194]
    )
    expected = DataFrame({"Numbers": numbers})

    parsed = all_parsers.read_csv(StringIO(data))
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow  # Flaky
@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
    """Values such as ``65248E10`` are parsed as floats (gh-2601)."""
    expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
    source = StringIO("65248E10 11\n55555E55 22\n")
    parsed = all_parsers.read_csv(source, header=None, sep=sep)
    tm.assert_frame_equal(parsed, expected)
def test_int64_min_issues(all_parsers):
    """A trailing empty field becomes NaN next to zero values (gh-2599)."""
    expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
    parsed = all_parsers.read_csv(StringIO("A,B\n0,0\n0,"))
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv):
    """Integers beyond UINT64_MAX stay strings, or overflow under a converter."""
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""
    parser = all_parsers

    if conv is not None:
        # 13007854817840016671868 > UINT64_MAX, so casting to either int64
        # or uint64 is expected to raise an OverflowError.
        overflow_patterns = [
            "(Python int too large to convert to C long)",
            "(long too big to convert)",
            "(int too big to convert)",
        ]
        with pytest.raises(OverflowError, match="|".join(overflow_patterns)):
            parser.read_csv(StringIO(data), converters={"ID": conv})
        return

    # 13007854817840016671868 > UINT64_MAX, so without a converter the
    # column is expected to come back as the original text (object dtype).
    id_strings = data.splitlines()[1:]
    expected = DataFrame(id_strings, columns=["ID"])
    parsed = parser.read_csv(StringIO(data))
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    """Boundary values of int64/uint64 round-trip as numbers."""
    # These numbers sit exactly on the edges of the int64-uint64 range, so
    # the expected frame holds the numeric value itself, not its text.
    expected = DataFrame([val])
    parsed = all_parsers.read_csv(StringIO(str(val)), header=None)
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
    """Values one step outside the int64-uint64 range are kept as strings."""
    text = str(val)
    expected = DataFrame([text])
    parsed = all_parsers.read_csv(StringIO(text), header=None)
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
    """A column mixing a negative value and 2**63 is kept as strings."""
    # No numerical dtype can hold both negative and uint64 values, so the
    # expected frame contains the original text.
    csv_text = "\n".join(exp_data)
    parsed = all_parsers.read_csv(StringIO(csv_text), header=None)
    tm.assert_frame_equal(parsed, DataFrame(exp_data))
def test_integer_precision(all_parsers):
    """19-digit integers in the last column are read without precision loss (GH 7072)."""
    s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
    expected = Series([4321583677327450765, 4321113141090630389], name=4)

    frame = all_parsers.read_csv(StringIO(s), header=None)
    last_column = frame[4]
    tm.assert_series_equal(last_column, expected)

View File

@@ -0,0 +1,108 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
def test_iterator(all_parsers):
    """Successive ``reader.read(n)`` calls return consecutive slices (gh-6607)."""
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    # Reference result: the whole input read in one go.
    full = parser.read_csv(StringIO(data), index_col=0)

    with parser.read_csv(StringIO(data), iterator=True, index_col=0) as reader:
        head = reader.read(3)
        tm.assert_frame_equal(head, full[:3])
        # Asking for more rows than remain returns just the remainder.
        tail = reader.read(5)
    tm.assert_frame_equal(tail, full[3:])
def test_iterator2(all_parsers):
    """Iterating a reader opened with ``iterator=True`` yields the parsed frame first."""
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    with all_parsers.read_csv(StringIO(data), iterator=True) as reader:
        chunks = list(reader)
    tm.assert_frame_equal(chunks[0], expected)
def test_iterator_stop_on_chunksize(all_parsers):
    """Iteration stops after the last chunk when ``chunksize`` is given (gh-3967)."""
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    with all_parsers.read_csv(StringIO(data), chunksize=1) as reader:
        chunks = list(reader)

    # Three data rows with chunksize=1 -> three chunks.
    assert len(chunks) == 3

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(concat(chunks), expected)
@pytest.mark.parametrize(
    "kwargs", [{"iterator": True, "chunksize": 1}, {"iterator": True}, {"chunksize": 1}]
)
def test_iterator_skipfooter_errors(all_parsers, kwargs):
    """``skipfooter`` combined with iteration or chunking raises ValueError."""
    source = StringIO("a\n1\n2")
    # The reader is created inside the ``raises`` context, so an error from
    # the ``read_csv`` call itself is what gets checked.
    with pytest.raises(
        ValueError, match="'skipfooter' not supported for iteration"
    ), all_parsers.read_csv(source, skipfooter=1, **kwargs) as _:
        pass
def test_iteration_open_handle(all_parsers):
    """Parsing a partially consumed file handle continues from its position."""
    expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as writer:
            writer.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")

        with open(path, encoding="utf-8") as handle:
            # Consume lines up to and including the one containing "CCC".
            for row in handle:
                if "CCC" in row:
                    break
            remainder = all_parsers.read_csv(handle, header=None)
            tm.assert_frame_equal(remainder, expected)

View File

@@ -0,0 +1,272 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
import codecs
import csv
from io import StringIO
import os
from pathlib import Path
import numpy as np
import pytest
from pandas.compat import PY311
from pandas.errors import (
EmptyDataError,
ParserError,
)
from pandas import DataFrame
import pandas._testing as tm
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
def test_empty_decimal_marker(all_parsers):
    """An empty ``decimal`` argument is rejected with ValueError."""
    data = """A|B|C
1|2,334|5
10|13|10.
"""
    # Parsers support only length-1 decimals
    with pytest.raises(ValueError, match="Only length-1 decimal markers supported"):
        all_parsers.read_csv(StringIO(data), decimal="")
def test_bad_stream_exception(all_parsers, csv_dir_path):
    """A decode failure in the stream surfaces as UnicodeDecodeError (gh-13652).

    Both the Python engine and the C engine should raise UnicodeDecodeError,
    rather than the C engine raising ParserError and swallowing the
    exception that caused the read to fail.
    """
    path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv")
    utf8 = codecs.lookup("utf-8")
    codec = codecs.lookup("utf-8")

    # Stream must be binary UTF8.
    with open(path, "rb") as handle:
        recoder = codecs.StreamRecoder(
            handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter
        )
        with recoder as stream:
            with pytest.raises(
                UnicodeDecodeError, match="'utf-8' codec can't decode byte"
            ):
                all_parsers.read_csv(stream)
def test_malformed(all_parsers):
    """A row with too many fields raises ParserError naming the line (gh-6607)."""
    data = """ignore
A,B,C
1,2,3 # comment
1,2,3,4,5
2,3,4
"""
    with pytest.raises(ParserError, match="Expected 3 fields in line 4, saw 5"):
        all_parsers.read_csv(StringIO(data), header=1, comment="#")
@pytest.mark.parametrize("nrows", [5, 3, None])
def test_malformed_chunks(all_parsers, nrows):
    """Chunked reading reports the malformed row with its file line number."""
    data = """ignore
A,B,C
skip
1,2,3
3,5,10 # comment
1,2,3,4,5
2,3,4
"""
    reader_options = {
        "header": 1,
        "comment": "#",
        "iterator": True,
        "chunksize": 1,
        "skiprows": [2],
    }
    with all_parsers.read_csv(StringIO(data), **reader_options) as reader, pytest.raises(
        ParserError, match="Expected 3 fields in line 6, saw 5"
    ):
        reader.read(nrows)
def test_catch_too_many_names(all_parsers):
    """Passing more ``names`` than header fields raises ValueError (gh-5156)."""
    data = """\
1,2,3
4,,6
7,8,9
10,11,12\n"""
    parser = all_parsers

    # The engines word this error differently.
    if parser.engine == "c":
        msg = "Too many columns specified: expected 4 and found 3"
    else:
        msg = (
            "Number of passed names did not match "
            "number of header fields in the file"
        )

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"])
@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5])
def test_raise_on_no_columns(all_parsers, nrows):
    """Input made only of newlines raises EmptyDataError."""
    blank_input = StringIO("\n" * nrows)
    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        all_parsers.read_csv(blank_input)
def test_unexpected_keyword_parameter_exception(all_parsers):
    """Unknown keyword arguments raise TypeError naming the reader (GH-34976)."""
    parser = all_parsers
    template = "{}\\(\\) got an unexpected keyword argument 'foo'"

    # Check read_csv first, then read_table, each with its own path argument.
    for reader_name, path in (("read_csv", "foo.csv"), ("read_table", "foo.tsv")):
        reader = getattr(parser, reader_name)
        with pytest.raises(TypeError, match=template.format(reader_name)):
            reader(path, foo=1)
def test_suppress_error_output(all_parsers, capsys):
    """``on_bad_lines="skip"`` drops bad rows and writes nothing to stderr (gh-15925)."""
    source = StringIO("a\n1\n1,2,3\n4\n5,6,7")
    parsed = all_parsers.read_csv(source, on_bad_lines="skip")
    tm.assert_frame_equal(parsed, DataFrame({"a": [1, 4]}))

    stderr_text = capsys.readouterr().err
    assert stderr_text == ""
def test_error_bad_lines(all_parsers):
    """``on_bad_lines="error"`` raises ParserError on the first bad row (gh-15925)."""
    source = StringIO("a\n1\n1,2,3\n4\n5,6,7")
    with pytest.raises(ParserError, match="Expected 1 fields in line 3, saw 3"):
        all_parsers.read_csv(source, on_bad_lines="error")
def test_warn_bad_lines(all_parsers, capsys):
    """``on_bad_lines="warn"`` skips bad rows and reports each on stderr (gh-15925)."""
    source = StringIO("a\n1\n1,2,3\n4\n5,6,7")
    parsed = all_parsers.read_csv(source, on_bad_lines="warn")
    tm.assert_frame_equal(parsed, DataFrame({"a": [1, 4]}))

    stderr_text = capsys.readouterr().err
    # Lines 3 and 5 of the input are the rows with extra fields.
    for bad_line in (3, 5):
        assert f"Skipping line {bad_line}" in stderr_text
def test_read_csv_wrong_num_columns(all_parsers):
    """A data row with more fields than the header raises ParserError."""
    # The header declares six columns; the second data row has seven fields.
    data = """A,B,C,D,E,F
1,2,3,4,5,6
6,7,8,9,10,11,12
11,12,13,14,15,16
"""
    with pytest.raises(ParserError, match="Expected 6 fields in line 3, saw 7"):
        all_parsers.read_csv(StringIO(data))
def test_null_byte_char(request, all_parsers):
    """A leading NULL byte is read as NaN or rejected, depending on engine (gh-2741)."""
    parser = all_parsers
    data = "\x00,foo"
    names = ["a", "b"]
    python_on_311 = parser.engine == "python" and PY311

    if parser.engine != "c" and not python_on_311:
        # Remaining engines are expected to reject the NULL byte outright.
        with pytest.raises(ParserError, match="NULL byte detected"):
            parser.read_csv(StringIO(data), names=names)
        return

    if python_on_311:
        request.node.add_marker(
            pytest.mark.xfail(
                reason="In Python 3.11, this is read as an empty character not null"
            )
        )
    expected = DataFrame([[np.nan, "foo"]], columns=names)
    parsed = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(parsed, expected)
@pytest.mark.filterwarnings("always::ResourceWarning")
def test_open_file(request, all_parsers):
    """A failed delimiter sniff must not leave an open file behind (GH 39024)."""
    parser = all_parsers
    if parser.engine == "c":
        xfail_c = pytest.mark.xfail(
            reason=f"{parser.engine} engine does not support sep=None "
            f"with delim_whitespace=False"
        )
        request.node.add_marker(xfail_c)

    with tm.ensure_clean() as path:
        file = Path(path)
        file.write_bytes(b"\xe4\na\n1")

        # should not trigger a ResourceWarning
        with tm.assert_produces_warning(None), pytest.raises(
            csv.Error, match="Could not determine delimiter"
        ):
            parser.read_csv(file, sep=None, encoding_errors="replace")
def test_invalid_on_bad_line(all_parsers):
    """An unrecognised ``on_bad_lines`` value raises ValueError."""
    source = StringIO("a\n1\n1,2,3\n4\n5,6,7")
    expected_msg = "Argument abc is invalid for on_bad_lines"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(source, on_bad_lines="abc")
def test_bad_header_uniform_error(all_parsers):
    """A one-field first line followed by wider rows raises ParserError."""
    parser = all_parsers
    data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n"

    # The C engine reports the problem while building the index; the other
    # engines report a field-count mismatch.
    if parser.engine == "c":
        msg = (
            "Could not construct index. Requested to use 1 "
            "number of columns, but 3 left to parse."
        )
    else:
        msg = "Expected 2 fields in line 2, saw 4"

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")
def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
    """Check the exact stderr text produced by ``on_bad_lines="warn"`` (gh-15925)."""
    parser = all_parsers
    data = """1,2
a,b
a,b,c
a,b,d
a,b
"""
    parsed = parser.read_csv(StringIO(data), on_bad_lines="warn")
    tm.assert_frame_equal(parsed, DataFrame({"1": "a", "2": ["b"] * 2}))

    # NOTE(review): the engines format the message differently; confirm the
    # blank-line layout of these literals against the upstream file.
    if parser.engine != "c":
        warn = """Skipping line 3: Expected 2 fields in line 3, saw 3
Skipping line 4: Expected 2 fields in line 4, saw 3
"""
    else:
        warn = """Skipping line 3: expected 2 fields, saw 3
Skipping line 4: expected 2 fields, saw 3
"""
    stderr_text = capsys.readouterr().err
    assert stderr_text == warn

View File

@@ -0,0 +1,55 @@
"""
Tests that work on both the Python and C engines but do not have a
specific classification into the other test modules.
"""
from io import StringIO
import pytest
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
def test_verbose_read(all_parsers, capsys):
    """``verbose=True`` prints engine-specific diagnostics to stdout."""
    parser = all_parsers
    data = """a,b,c,d
one,1,2,3
one,1,2,3
,1,2,3
one,1,2,3
,1,2,3
,1,2,3
one,1,2,3
two,1,2,3"""
    parser.read_csv(StringIO(data), verbose=True)
    stdout_text = capsys.readouterr().out

    # Engines are verbose in different ways.
    if parser.engine != "c":  # Python engine
        assert stdout_text == "Filled 3 NA values in column a\n"
    else:
        for fragment in ("Tokenization took:", "Parser memory cleanup took:"):
            assert fragment in stdout_text
def test_verbose_read2(all_parsers, capsys):
    """``verbose=True`` with ``index_col=0`` prints engine-specific diagnostics."""
    parser = all_parsers
    data = """a,b,c,d
one,1,2,3
two,1,2,3
three,1,2,3
four,1,2,3
five,1,2,3
,1,2,3
seven,1,2,3
eight,1,2,3"""
    parser.read_csv(StringIO(data), verbose=True, index_col=0)
    stdout_text = capsys.readouterr().out

    # Engines are verbose in different ways.
    if parser.engine != "c":  # Python engine
        assert stdout_text == "Filled 1 NA values in column a\n"
    else:
        for fragment in ("Tokenization took:", "Parser memory cleanup took:"):
            assert fragment in stdout_text

View File

@@ -0,0 +1,297 @@
from __future__ import annotations
import os
import pytest
from pandas.compat._optional import VERSIONS
from pandas import (
read_csv,
read_table,
)
import pandas._testing as tm
class BaseParser:
    """Wrapper around ``read_csv``/``read_table`` that pins engine settings.

    Subclasses set ``engine`` (and optionally ``low_memory``); every call made
    through the wrapper has those two keyword arguments forced in.
    """

    # Parser engine name passed to pandas; None in the base class.
    engine: str | None = None
    low_memory = True
    # ``float_precision`` values a concrete parser is exercised with.
    float_precision_choices: list[str | None] = []

    def update_kwargs(self, kwargs):
        """Return a copy of *kwargs* with ``engine`` and ``low_memory`` set."""
        return {**kwargs, "engine": self.engine, "low_memory": self.low_memory}

    def read_csv(self, *args, **kwargs):
        """Call ``pandas.read_csv`` with this parser's engine settings."""
        return read_csv(*args, **self.update_kwargs(kwargs))

    def read_csv_check_warnings(
        self, warn_type: type[Warning], warn_msg: str, *args, **kwargs
    ):
        """Call ``read_csv`` and assert it emits *warn_type* matching *warn_msg*."""
        # We need to check the stacklevel here instead of in the tests
        # since this is where read_csv is called and where the warning
        # should point to.
        merged = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(warn_type, match=warn_msg):
            return read_csv(*args, **merged)

    def read_table(self, *args, **kwargs):
        """Call ``pandas.read_table`` with this parser's engine settings."""
        return read_table(*args, **self.update_kwargs(kwargs))

    def read_table_check_warnings(
        self, warn_type: type[Warning], warn_msg: str, *args, **kwargs
    ):
        """Call ``read_table`` and assert it emits *warn_type* matching *warn_msg*."""
        # We need to check the stacklevel here instead of in the tests
        # since this is where read_table is called and where the warning
        # should point to.
        merged = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(warn_type, match=warn_msg):
            return read_table(*args, **merged)
class CParser(BaseParser):
    """Parser wrapper that selects the C engine."""

    float_precision_choices = [None, "high", "round_trip"]
    engine = "c"
class CParserHighMemory(CParser):
    """C-engine parser wrapper with ``low_memory`` switched off."""

    low_memory = False
class CParserLowMemory(CParser):
    """C-engine parser wrapper with ``low_memory`` switched on."""

    low_memory = True
class PythonParser(BaseParser):
    """Parser wrapper that selects the Python engine."""

    float_precision_choices = [None]
    engine = "python"
class PyArrowParser(BaseParser):
    """Parser wrapper that selects the pyarrow engine."""

    float_precision_choices = [None]
    engine = "pyarrow"
@pytest.fixture
def csv_dir_path(datapath):
    """
    Directory holding the data files used by the parser tests.
    """
    parser_data_dir = datapath("io", "parser", "data")
    return parser_data_dir
@pytest.fixture
def csv1(datapath):
    """
    Path to the "test1.csv" data file used by the parser tests.
    """
    csv_dir = datapath("io", "data", "csv")
    return os.path.join(csv_dir, "test1.csv")
# Aliases for the parser wrapper classes used to build the fixture params.
_cParserHighMemory = CParserHighMemory
_cParserLowMemory = CParserLowMemory
_pythonParser = PythonParser
_pyarrowParser = PyArrowParser

# Per-engine parameter lists; the pyarrow entry is wrapped in pytest.param so
# it carries the single_cpu mark.
_py_parsers_only = [_pythonParser]
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)]

_all_parsers = _c_parsers_only + _py_parsers_only + _pyarrow_parsers_only

# Test ids, listed in the same order as the parameter lists above.
_py_parser_ids = ["python"]
_c_parser_ids = ["c_high", "c_low"]
_pyarrow_parsers_ids = ["pyarrow"]

_all_parser_ids = _c_parser_ids + _py_parser_ids + _pyarrow_parsers_ids
@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
    """
    Fixture yielding an instance of each CSV parser wrapper.
    """
    parser = request.param()
    if parser.engine != "pyarrow":
        return parser

    # Skips the test when pyarrow (at the required version) is unavailable.
    pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
    # Try finding a way to disable threads all together
    # for more stable CI runs
    import pyarrow

    pyarrow.set_cpu_count(1)
    return parser
@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
    """
    Fixture yielding an instance of each C-engine parser wrapper.
    """
    parser_cls = request.param
    return parser_cls()
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
    """
    Fixture yielding an instance of each Python-engine parser wrapper.
    """
    parser_cls = request.param
    return parser_cls()
@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
def pyarrow_parser_only(request):
    """
    Fixture yielding an instance of each pyarrow-engine parser wrapper.
    """
    parser_cls = request.param
    return parser_cls()
def _get_all_parser_float_precision_combinations():
    """
    Build fixture params and ids for every (parser, float precision) pair.

    Returns a dict with keys ``"params"`` and ``"ids"``; each param wraps a
    ``(parser_instance, precision)`` tuple.
    """
    params = []
    ids = []
    for entry, parser_id in zip(_all_parsers, _all_parser_ids):
        # Entries wrapped in pytest.param expose the class through ``values``.
        parser_cls = getattr(entry, "values", (entry,))[0]
        # Re-apply the single_cpu mark for pyarrow; no marks otherwise.
        marks = pytest.mark.single_cpu if parser_cls.engine == "pyarrow" else ()
        for precision in parser_cls.float_precision_choices:
            params.append(pytest.param((parser_cls(), precision), marks=marks))
            ids.append(f"{parser_id}-{precision}")

    return {"params": params, "ids": ids}
# The helper returns exactly the ``params`` and ``ids`` keyword arguments.
@pytest.fixture(**_get_all_parser_float_precision_combinations())
def all_parsers_all_precisions(request):
    """
    Fixture yielding each allowable ``(parser, float_precision)`` pair.
    """
    parser_and_precision = request.param
    return parser_and_precision
# Bit widths substituted into the encoding-name templates below.
_utf_values = [8, 16, 32]

_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]

# One "{prefix}{sep}{0}" template per separator/prefix pair, separators
# varying slowest; "{0}" is left as a placeholder for the UTF value.
_encoding_fmts = []
for _sep in _encoding_seps:
    for _prefix in _encoding_prefixes:
        _encoding_fmts.append(f"{_prefix}{_sep}{{0}}")
del _sep, _prefix
@pytest.fixture(params=_utf_values)
def utf_value(request):
    """
    Fixture yielding each integer value used for a UTF encoding.
    """
    bits = request.param
    return bits
@pytest.fixture(params=_encoding_fmts)
def encoding_fmt(request):
    """
    Fixture yielding each string format of a UTF encoding name.
    """
    template = request.param
    return template
@pytest.fixture(
    params=[
        ("-1,0", -1.0),
        ("-1,2e0", -1.2),
        ("-1e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("+,1e1", 1.0),
        ("+1,e0", 1.0),
        ("-,1e1", -1.0),
        ("-1,e0", -1.0),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        ("1_,", 1.0),
        ("1_234,56", 1234.56),
        ("1_234,56e0", 1234.56),
        # negative cases; must not parse as float, so the expected result is
        # the input text itself
        *(
            (text, text)
            for text in (
                "_",
                "-_",
                "-_1",
                "-_1e0",
                "_1",
                "_1,",
                "_1,_",
                "_1e0",
                "1,2e_1",
                "1,2e1_0",
                "1,_2",
                ",1__2",
                ",1e",
                "-,1e",
                "1_000,000_000",
                "1,e1_2",
                "e11,2",
                "1e11,2",
                "1,2,2",
                "1,2_1",
                "1,2e-10e1",
                "--1,2",
                "1a_2,1",
            )
        ),
        ("1,2E-1", 0.12),
        ("1,2E1", 12.0),
    ]
)
def numeric_decimal(request):
    """
    Fixture yielding ``(text, expected)`` pairs of numeric formats.

    The first entry is the value to read and the second is the expected
    result; for inputs that must not be recognised as numbers the expected
    result is the unchanged text.
    """
    text_and_expected = request.param
    return text_and_expected
@pytest.fixture
def pyarrow_xfail(request):
    """
    Fixture that marks a test as xfail when the parser engine is pyarrow.
    """
    requested = request.fixturenames
    if "all_parsers" in requested:
        engine = request.getfixturevalue("all_parsers").engine
    elif "all_parsers_all_precisions" in requested:
        # Fixture value is a (parser, precision) tuple.
        engine = request.getfixturevalue("all_parsers_all_precisions")[0].engine
    else:
        # The test uses neither parser fixture; nothing to mark.
        return

    if engine == "pyarrow":
        request.node.add_marker(
            pytest.mark.xfail(reason="pyarrow doesn't support this.")
        )
@pytest.fixture
def pyarrow_skip(request):
    """
    Fixture that skips a test when the parser engine is pyarrow.
    """
    requested = request.fixturenames
    if "all_parsers" in requested:
        engine = request.getfixturevalue("all_parsers").engine
    elif "all_parsers_all_precisions" in requested:
        # Fixture value is a (parser, precision) tuple.
        engine = request.getfixturevalue("all_parsers_all_precisions")[0].engine
    else:
        # The test uses neither parser fixture; nothing to skip.
        return

    if engine == "pyarrow":
        pytest.skip("pyarrow doesn't support this.")

View File

@@ -0,0 +1,314 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os
import numpy as np
import pytest
from pandas._libs import parsers as libparsers
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Timestamp,
)
import pandas._testing as tm
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@xfail_pyarrow
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        {"a": "category", "b": "category", "c": CategoricalDtype()},
    ],
)
def test_categorical_dtype(all_parsers, dtype):
    """Every column is read as a categorical of its text values (gh-10153)."""
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    column_text = {
        "a": ["1", "1", "2"],
        "b": ["a", "a", "b"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in column_text.items()}
    )
    parsed = all_parsers.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow  # Flaky
@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
    """Only column ``b``, selected by name or position, is categorical (gh-10153)."""
    data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
    expected = DataFrame(
        {
            "a": [1, 1, 2],
            "b": Categorical(["a", "a", "b"]),
            "c": [3.4, 3.4, 4.5],
        }
    )
    parsed = all_parsers.read_csv(StringIO(data), dtype=dtype)
    tm.assert_frame_equal(parsed, expected)
@xfail_pyarrow
def test_categorical_dtype_unsorted(all_parsers):
    """Categorical parsing when values appear out of sorted order (gh-10153)."""
    data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
    column_text = {
        "a": ["1", "1", "2"],
        "b": ["b", "b", "a"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in column_text.items()}
    )
    parsed = all_parsers.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(parsed, expected)
@xfail_pyarrow
def test_categorical_dtype_missing(all_parsers):
    """The text ``nan`` becomes a missing value in a categorical column (gh-10153)."""
    data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
    column_values = {
        "a": ["1", "1", "2"],
        "b": ["b", np.nan, "a"],
        "c": ["3.4", "3.4", "4.5"],
    }
    expected = DataFrame(
        {name: Categorical(values) for name, values in column_values.items()}
    )
    parsed = all_parsers.read_csv(StringIO(data), dtype="category")
    tm.assert_frame_equal(parsed, expected)
@xfail_pyarrow
@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers, monkeypatch):
    """Categorical parsing with more values than the buffer heuristic (gh-18186).

    This was an issue with the C parser, due to DEFAULT_BUFFER_HEURISTIC.
    """
    heuristic = 2**5
    # One more distinct value than the patched heuristic, sorted as text.
    values = np.sort([str(number) for number in range(heuristic + 1)])
    expected = DataFrame({"a": Categorical(values, ordered=True)})
    csv_text = "a\n" + "\n".join(values)

    with monkeypatch.context() as patcher:
        patcher.setattr(libparsers, "DEFAULT_BUFFER_HEURISTIC", heuristic)
        parsed = all_parsers.read_csv(StringIO(csv_text), dtype="category")

    # Put the parsed categories into sorted, ordered form before comparing.
    sorted_categories = np.sort(parsed.a.cat.categories)
    parsed["a"] = parsed["a"].cat.reorder_categories(sorted_categories, ordered=True)
    tm.assert_frame_equal(parsed, expected)
def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
    """Categorical parsing of a UTF-16, tab-separated file (gh-10153)."""
    parser = all_parsers
    path = os.path.join(csv_dir_path, "utf16_ex.txt")
    read_options = {"sep": "\t", "encoding": "utf-16"}

    # Reference: read normally, then convert every column to categorical.
    expected = parser.read_csv(path, **read_options).apply(Categorical)

    parsed = parser.read_csv(path, dtype="category", **read_options)
    tm.assert_frame_equal(parsed, expected)
@xfail_pyarrow
def test_categorical_dtype_chunksize_infer_categories(all_parsers):
    """Each chunk's categories come from that chunk's own values (gh-10153)."""
    data = """a,b
1,a
1,b
1,b
2,c"""
    first = DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])})
    second = DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3])

    reader = all_parsers.read_csv(
        StringIO(data), dtype={"b": "category"}, chunksize=2
    )
    with reader as chunks:
        for chunk, wanted in zip(chunks, [first, second]):
            tm.assert_frame_equal(chunk, wanted)
@xfail_pyarrow
def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
    """Every chunk carries the full explicit category list (gh-10153)."""
    data = """a,b
1,a
1,b
1,b
2,c"""
    cats = ["a", "b", "c"]
    first = DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)})
    second = DataFrame(
        {"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
        index=[2, 3],
    )

    reader = all_parsers.read_csv(
        StringIO(data), dtype={"b": CategoricalDtype(cats)}, chunksize=2
    )
    with reader as chunks:
        for chunk, wanted in zip(chunks, [first, second]):
            tm.assert_frame_equal(chunk, wanted)
def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
    """Categorical parsing of one column in a latin-1 encoded file (gh-10153)."""
    parser = all_parsers
    path = os.path.join(csv_dir_path, "unicode_series.csv")
    read_options = {"header": None, "encoding": "latin-1"}

    # Reference: read normally, then convert column 1 to categorical.
    expected = parser.read_csv(path, **read_options)
    expected[1] = Categorical(expected[1])

    parsed = parser.read_csv(path, dtype={1: "category"}, **read_options)
    tm.assert_frame_equal(parsed, expected)
@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
    "categories",
    [["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
    """An explicit CategoricalDtype sets the categories and orderedness."""
    data = """a,b
1,a
1,b
1,b
2,c"""
    b_values = Categorical(
        ["a", "b", "b", "c"], categories=categories, ordered=ordered
    )
    expected = DataFrame({"a": [1, 1, 1, 2], "b": b_values})

    b_dtype = CategoricalDtype(categories=categories, ordered=ordered)
    parsed = all_parsers.read_csv(StringIO(data), dtype={"b": b_dtype})
    tm.assert_frame_equal(parsed, expected)
def test_categorical_category_dtype_unsorted(all_parsers):
    """Categories given in non-sorted order are kept in that order."""
    data = """a,b
1,a
1,b
1,b
2,c"""
    category_order = ["c", "b", "a"]
    b_values = Categorical(["a", "b", "b", "c"], categories=category_order)
    expected = DataFrame({"a": [1, 1, 1, 2], "b": b_values})

    parsed = all_parsers.read_csv(
        StringIO(data), dtype={"b": CategoricalDtype(["c", "b", "a"])}
    )
    tm.assert_frame_equal(parsed, expected)
def test_categorical_coerces_numeric(all_parsers):
    """Text values are coerced to match integer categories."""
    int_categories = CategoricalDtype([1, 2, 3])
    expected = DataFrame({"b": Categorical([1, 1, 2, 3])})
    parsed = all_parsers.read_csv(StringIO("b\n1\n1\n2\n3"), dtype={"b": int_categories})
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow  # Flaky
def test_categorical_coerces_datetime(all_parsers):
    """Date strings are parsed into a Categorical built from a DatetimeIndex."""
    dates = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
    cat_dtype = CategoricalDtype(dates)
    csv_text = "b\n2017-01-01\n2018-01-01\n2019-01-01"

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})

    # Each row holds exactly one category, in category order.
    expected = DataFrame({"b": Categorical(cat_dtype.categories)})
    tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timestamp(all_parsers):
    """Date strings are compared against a Categorical of ``Timestamp`` values."""
    csv_text = "b\n2014-01-01\n2014-01-01"
    stamp_dtype = CategoricalDtype([Timestamp("2014")])

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": stamp_dtype})

    expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})
    tm.assert_frame_equal(result, expected)
def test_categorical_coerces_timedelta(all_parsers):
    """Duration strings are parsed into a Categorical built from timedeltas."""
    deltas = pd.to_timedelta(["1H", "2H", "3H"])
    cat_dtype = CategoricalDtype(deltas)
    csv_text = "b\n1H\n2H\n3H"

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})

    expected = DataFrame({"b": Categorical(cat_dtype.categories)})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [
        "b\nTrue\nFalse\nNA\nFalse",
        "b\ntrue\nfalse\nNA\nfalse",
        "b\nTRUE\nFALSE\nNA\nFALSE",
        "b\nTrue\nFalse\nNA\nFALSE",
    ],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
    """Boolean spellings in any case are compared against a boolean Categorical."""
    # see gh-20498
    bool_categories = CategoricalDtype([False, True])

    result = all_parsers.read_csv(StringIO(data), dtype={"b": bool_categories})

    expected = DataFrame({"b": Categorical([True, False, None, False])})
    tm.assert_frame_equal(result, expected)
def test_categorical_unexpected_categories(all_parsers):
    """A value outside the declared categories is compared via the same dtype."""
    cat_dtype = CategoricalDtype(["a", "b", "d", "e"])
    csv_text = "b\nd\na\nc\nd"  # Unexpected c

    result = all_parsers.read_csv(StringIO(csv_text), dtype={"b": cat_dtype})

    expected = DataFrame({"b": Categorical(["d", "a", "c", "d"], dtype=cat_dtype)})
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,577 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from collections import defaultdict
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserWarning
import pandas as pd
from pandas import (
DataFrame,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
IntegerArray,
StringArray,
)
@pytest.mark.parametrize("dtype", [str, object])
@pytest.mark.parametrize("check_orig", [True, False])
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_all_columns(all_parsers, dtype, check_orig):
    """A single ``dtype`` of str/object is applied to every column on read."""
    # see gh-3795, gh-6607
    frame = DataFrame(
        np.random.default_rng(2).random((5, 2)).round(4),
        columns=list("AB"),
        index=["1A", "1B", "1C", "1D", "1E"],
    )

    with tm.ensure_clean("__passing_str_as_dtype__.csv") as csv_path:
        frame.to_csv(csv_path)
        parsed = all_parsers.read_csv(csv_path, dtype=dtype, index_col=0)

        if not check_orig:
            # Compare the text form directly.
            expected = frame.astype(str)
        else:
            # Cast the parsed text back to float and compare with the original.
            expected = frame.copy()
            parsed = parsed.astype(float)

        tm.assert_frame_equal(parsed, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
    """A dtype mapping may address columns by name (``"one"``) or position (``1``)."""
    csv_text = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""

    result = all_parsers.read_csv(
        StringIO(csv_text), dtype={"one": np.float64, 1: str}
    )

    expected = DataFrame(
        [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
    )
    for column, target in (("one", np.float64), ("two", object)):
        expected[column] = expected[column].astype(target)
    tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_invalid_dtype_per_column(all_parsers):
    """An unknown dtype string in the mapping raises ``TypeError``."""
    csv_text = """\
one,two
1,2.5
2,3.5
3,4.5
4,5.5"""
    bad_dtypes = {"one": "foo", 1: "int"}
    error_pattern = "data type [\"']foo[\"'] not understood"

    with pytest.raises(TypeError, match=error_pattern):
        all_parsers.read_csv(StringIO(csv_text), dtype=bad_dtypes)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_raise_on_passed_int_dtype_with_nas(all_parsers):
    """Requesting ``int64`` for a column with a missing value raises ``ValueError``."""
    # see gh-2631
    csv_text = """YEAR, DOY, a
2001,106380451,10
2001,,11
2001,106380451,67"""

    # The expected message depends on the engine under test.
    if all_parsers.engine == "c":
        expected_msg = "Integer column has NA values"
    else:
        expected_msg = "Unable to convert column DOY"

    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(
            StringIO(csv_text), dtype={"DOY": np.int64}, skipinitialspace=True
        )
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_with_converters(all_parsers):
    """With both a converter and a dtype for column ``a``, the converter's output wins."""
    csv_text = """a,b
1.1,2.2
1.2,2.3"""
    warning_text = (
        "Both a converter and dtype were specified for column a "
        "- only the converter will be used."
    )

    # Dtype spec ignored if converted specified.
    result = all_parsers.read_csv_check_warnings(
        ParserWarning,
        warning_text,
        StringIO(csv_text),
        dtype={"a": "i8"},
        converters={"a": lambda x: str(x)},
    )

    tm.assert_frame_equal(result, DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}))
@pytest.mark.parametrize(
    "dtype", list(np.typecodes["AllInteger"] + np.typecodes["Float"])
)
def test_numeric_dtype(all_parsers, dtype):
    """Every NumPy integer/float typecode is accepted as a global ``dtype``."""
    csv_text = "0\n1"

    result = all_parsers.read_csv(StringIO(csv_text), header=None, dtype=dtype)

    expected = DataFrame([0, 1], dtype=dtype)
    tm.assert_frame_equal(expected, result)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_boolean_dtype(all_parsers):
    """The nullable ``boolean`` dtype maps several true/false/missing spellings."""
    truthy = ["True", "TRUE", "true", "1", "1.0"]
    falsy = ["False", "FALSE", "false", "0", "0.0"]
    missing = ["NaN", "nan", "NA", "null", "NULL"]
    # Header "a" followed by one value per line.
    csv_text = "\n".join(["a", *truthy, *falsy, *missing])

    result = all_parsers.read_csv(StringIO(csv_text), dtype="boolean")

    values = [True] * len(truthy) + [False] * len(falsy) + [None] * len(missing)
    expected = DataFrame({"a": pd.array(values, dtype="boolean")})
    tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_delimiter_with_usecols_and_parse_dates(all_parsers):
    """``decimal=","`` works together with ``usecols`` and ``parse_dates``."""
    # GH#35873
    buffer = StringIO('"dump","-9,1","-9,1",20101010')
    wanted = ["col1", "col2", "col3"]

    result = all_parsers.read_csv(
        buffer,
        engine="python",
        names=["col", "col1", "col2", "col3"],
        usecols=wanted,
        parse_dates=["col3"],
        decimal=",",
    )

    expected = DataFrame(
        {"col1": [-9.1], "col2": [-9.1], "col3": [Timestamp("2010-10-10")]}
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("thousands", ["_", None])
def test_decimal_and_exponential(
    request, python_parser_only, numeric_decimal, thousands
):
    """Run the shared decimal check with no ``float_precision`` setting."""
    # GH#31920
    float_precision = None
    decimal_number_check(
        request, python_parser_only, numeric_decimal, thousands, float_precision
    )
@pytest.mark.parametrize("thousands", ["_", None])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_1000_sep_decimal_float_precision(
    request, c_parser_only, numeric_decimal, float_precision, thousands
):
    """Run the shared decimal check on the raw and space-padded input text."""
    # test decimal and thousand sep handling in across 'float_precision'
    # parsers
    decimal_number_check(
        request, c_parser_only, numeric_decimal, thousands, float_precision
    )

    raw_text, raw_value = numeric_decimal
    padded_text = " " + raw_text + " "
    # the negative cases (parse as text) keep their padding in the result
    padded_value = " " + raw_value + " " if isinstance(raw_value, str) else raw_value
    decimal_number_check(
        request,
        c_parser_only,
        (padded_text, padded_value),
        thousands,
        float_precision,
    )
def decimal_number_check(request, parser, numeric_decimal, thousands, float_precision):
    """Parse ``numeric_decimal[0]`` with ``decimal=","`` and compare with ``numeric_decimal[1]``.

    Three inputs containing ``_`` are marked xfail on the requesting test
    when no thousands separator is configured.
    """
    # GH#31920
    text = numeric_decimal[0]
    needs_separator = ("1_,", "1_234,56", "1_234,56e0")
    if thousands is None and text in needs_separator:
        xfail_mark = pytest.mark.xfail(
            reason=f"thousands={thousands} and sep is in {text}"
        )
        request.node.add_marker(xfail_mark)

    frame = parser.read_csv(
        StringIO(text),
        float_precision=float_precision,
        sep="|",
        thousands=thousands,
        decimal=",",
        header=None,
    )
    # The input is a single cell, so only the top-left value matters.
    parsed = frame.iloc[0, 0]
    assert parsed == numeric_decimal[1]
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
def test_skip_whitespace(c_parser_only, float_precision):
    """Spaces around tab-separated numbers do not prevent float parsing."""
    tab_separated = """id\tnum\t
1\t1.2 \t
1\t 2.1\t
2\t 1\t
2\t 1.2 \t
"""
    frame = c_parser_only.read_csv(
        StringIO(tab_separated),
        float_precision=float_precision,
        sep="\t",
        header=0,
        dtype={1: np.float64},
    )

    expected_num = pd.Series([1.2, 2.1, 1.0, 1.2], name="num")
    tm.assert_series_equal(frame.iloc[:, 1], expected_num)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_true_values_cast_to_bool(all_parsers):
    """Custom true/false values combine with ``1``/``0`` under the ``boolean`` dtype."""
    # GH#34655
    csv_text = """a,b
yes,xxx
no,yyy
1,zzz
0,aaa
"""
    result = all_parsers.read_csv(
        StringIO(csv_text),
        true_values=["yes"],
        false_values=["no"],
        dtype={"a": "boolean"},
    )

    expected = DataFrame(
        {"a": [True, False, True, False], "b": ["xxx", "yyy", "zzz", "aaa"]}
    )
    expected["a"] = expected["a"].astype("boolean")
    tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value):
    """Dtypes for duplicate column names are applied and the mapping is left unchanged."""
    # GH#35211
    csv_text = """a,a\n1,1"""
    requested = {"a": str}
    requested.update(dtypes)
    # GH#42462: keep a snapshot to detect in-place modification by the reader
    snapshot = dict(requested)

    result = all_parsers.read_csv(StringIO(csv_text), dtype=requested)

    assert requested == snapshot, "dtype dict changed"
    expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
    tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_mangle_dup_cols_single_dtype(all_parsers):
    """A single ``dtype=str`` applies to both the original and the renamed duplicate."""
    # GH#42022
    csv_text = """a,a\n1,1"""

    result = all_parsers.read_csv(StringIO(csv_text), dtype=str)

    tm.assert_frame_equal(result, DataFrame({"a": ["1"], "a.1": ["1"]}))
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_multi_index(all_parsers):
    """With a two-row header, dtypes can be keyed by column tuples."""
    # GH 42446
    csv_text = "A,B,B\nX,Y,Z\n1,2,3"
    column_dtypes = {
        ("A", "X"): np.int32,
        ("B", "Y"): np.int32,
        ("B", "Z"): np.float32,
    }

    result = all_parsers.read_csv(
        StringIO(csv_text),
        header=list(range(2)),
        dtype=column_dtypes,
    )

    expected = DataFrame(
        {
            ("A", "X"): np.int32([1]),
            ("B", "Y"): np.int32([2]),
            ("B", "Z"): np.float32([3]),
        }
    )
    tm.assert_frame_equal(result, expected)
def test_nullable_int_dtype(all_parsers, any_int_ea_dtype):
    """Empty cells become ``pd.NA`` when a nullable integer dtype is requested."""
    # GH 25472
    csv_text = """a,b,c
,3,5
1,,6
2,4,"""

    actual = all_parsers.read_csv(StringIO(csv_text), dtype=any_int_ea_dtype)

    column_values = {
        "a": [pd.NA, 1, 2],
        "b": [3, pd.NA, 4],
        "c": [5, 6, pd.NA],
    }
    expected = DataFrame(
        {
            name: pd.array(values, dtype=any_int_ea_dtype)
            for name, values in column_values.items()
        }
    )
    tm.assert_frame_equal(actual, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("default", ["float", "float64"])
def test_dtypes_defaultdict(all_parsers, default):
    """A ``defaultdict`` dtype uses its default for columns without an explicit entry."""
    # GH#41574
    csv_text = """a,b
1,2
"""
    # Only "a" is listed; "b" falls back to the factory value.
    dtype_map = defaultdict(lambda: default, a="int64")

    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_map)

    tm.assert_frame_equal(result, DataFrame({"a": [1], "b": 2.0}))
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_mangle_dup_cols(all_parsers):
    """``defaultdict`` dtypes are resolved against the renamed duplicate columns."""
    # GH#41574
    csv_text = """a,b,a,b,b.1
1,2,3,4,5
"""
    dtype_map = defaultdict(lambda: "float64", {"a": "int64", "b.1": "int64"})

    result = all_parsers.read_csv(StringIO(csv_text), dtype=dtype_map)

    expected = DataFrame(
        {"a": [1], "b": [2.0], "a.1": [3], "b.2": [4.0], "b.1": [5]}
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtypes_defaultdict_invalid(all_parsers):
    """An invalid default dtype from a ``defaultdict`` raises ``TypeError``."""
    # GH#41574
    csv_text = """a,b
1,2
"""
    dtype_map = defaultdict(lambda: "invalid_dtype", a="int64")

    with pytest.raises(TypeError, match="not understood"):
        all_parsers.read_csv(StringIO(csv_text), dtype=dtype_map)
def test_dtype_backend(all_parsers):
    """``dtype_backend="numpy_nullable"`` yields nullable dtypes for each column kind."""
    # GH#36712
    csv_text = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
    result = all_parsers.read_csv(
        StringIO(csv_text), dtype_backend="numpy_nullable", parse_dates=["i"]
    )

    missing = pd.NA
    expected = DataFrame(
        {
            "a": pd.Series([1, 3], dtype="Int64"),
            "b": pd.Series([2.5, 4.5], dtype="Float64"),
            "c": pd.Series([True, False], dtype="boolean"),
            "d": pd.Series(["a", "b"], dtype="string"),
            "e": pd.Series([missing, 6], dtype="Int64"),
            "f": pd.Series([missing, 7.5], dtype="Float64"),
            "g": pd.Series([missing, True], dtype="boolean"),
            "h": pd.Series([missing, "a"], dtype="string"),
            "i": pd.Series([Timestamp("2019-12-31")] * 2),
            # A column with no values at all is expected as Int64.
            "j": pd.Series([missing, missing], dtype="Int64"),
        }
    )
    tm.assert_frame_equal(result, expected)
def test_dtype_backend_and_dtype(all_parsers):
    """An explicit ``dtype="float64"`` is honoured alongside ``dtype_backend``."""
    # GH#36712
    csv_text = """a,b
1,2.5
,
"""
    result = all_parsers.read_csv(
        StringIO(csv_text), dtype_backend="numpy_nullable", dtype="float64"
    )

    tm.assert_frame_equal(
        result, DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]})
    )
def test_dtype_backend_string(all_parsers, string_storage):
    """String columns use the array type matching ``mode.string_storage``."""
    # GH#36712
    pa = pytest.importorskip("pyarrow")

    with pd.option_context("mode.string_storage", string_storage):
        csv_text = """a,b
a,x
b,
"""
        result = all_parsers.read_csv(
            StringIO(csv_text), dtype_backend="numpy_nullable"
        )

        if string_storage != "python":
            columns = {
                "a": ArrowStringArray(pa.array(["a", "b"])),
                "b": ArrowStringArray(pa.array(["x", None])),
            }
        else:
            columns = {
                "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
                "b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
            }
        expected = DataFrame(columns)
        tm.assert_frame_equal(result, expected)
def test_dtype_backend_ea_dtype_specified(all_parsers):
    """An explicit ``Int64`` dtype combined with ``dtype_backend`` gives Int64 columns."""
    # GH#491496
    csv_text = """a,b
1,2
"""
    result = all_parsers.read_csv(
        StringIO(csv_text), dtype="Int64", dtype_backend="numpy_nullable"
    )

    tm.assert_frame_equal(result, DataFrame({"a": [1], "b": 2}, dtype="Int64"))
def test_dtype_backend_pyarrow(all_parsers, request):
    """``dtype_backend="pyarrow"`` yields Arrow-backed dtypes for each column kind."""
    # GH#36712
    pa = pytest.importorskip("pyarrow")
    arrow_string = pd.ArrowDtype(pa.string())

    csv_text = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""
    result = all_parsers.read_csv(
        StringIO(csv_text), dtype_backend="pyarrow", parse_dates=["i"]
    )

    expected = DataFrame(
        {
            "a": pd.Series([1, 3], dtype="int64[pyarrow]"),
            "b": pd.Series([2.5, 4.5], dtype="float64[pyarrow]"),
            "c": pd.Series([True, False], dtype="bool[pyarrow]"),
            "d": pd.Series(["a", "b"], dtype=arrow_string),
            "e": pd.Series([pd.NA, 6], dtype="int64[pyarrow]"),
            "f": pd.Series([pd.NA, 7.5], dtype="float64[pyarrow]"),
            "g": pd.Series([pd.NA, True], dtype="bool[pyarrow]"),
            "h": pd.Series([pd.NA, "a"], dtype=arrow_string),
            "i": pd.Series([Timestamp("2019-12-31")] * 2),
            # A column with no values at all is expected as Arrow's null type.
            "j": pd.Series([pd.NA, pd.NA], dtype="null[pyarrow]"),
        }
    )
    tm.assert_frame_equal(result, expected)
def test_ea_int_avoid_overflow(all_parsers):
    """A large integer next to a missing value keeps its exact value under ``Int64``."""
    # GH#32134
    csv_text = """a,b
1,1
,1
1582218195625938945,1
"""
    result = all_parsers.read_csv(StringIO(csv_text), dtype={"a": "Int64"})

    # The second entry is masked; its underlying value is a placeholder.
    values = np.array([1, 1, 1582218195625938945])
    mask = np.array([False, True, False])
    expected = DataFrame({"a": IntegerArray(values, mask), "b": 1})
    tm.assert_frame_equal(result, expected)
def test_string_inference(all_parsers):
    """With ``future.infer_string`` on, text columns and labels use the Arrow string dtype."""
    # GH#54430
    pytest.importorskip("pyarrow")
    string_dtype = "string[pyarrow_numpy]"

    csv_text = """a,b
x,1
y,2
,3"""
    with pd.option_context("future.infer_string", True):
        result = all_parsers.read_csv(StringIO(csv_text))

    expected_a = pd.Series(["x", "y", None], dtype=string_dtype)
    expected = DataFrame(
        {"a": expected_a, "b": [1, 2, 3]},
        columns=pd.Index(["a", "b"], dtype=string_dtype),
    )
    tm.assert_frame_equal(result, expected)
def test_dtypes_with_usecols(all_parsers):
    """A per-column dtype still applies when ``usecols`` selects a subset of columns."""
    # GH#54868
    csv_text = """a,b,c
1,2,3
4,5,6"""

    result = all_parsers.read_csv(
        StringIO(csv_text), usecols=["a", "c"], dtype={"a": object}
    )

    # The pyarrow engine is expected to hold ints in the object column,
    # the other engines hold the raw strings.
    a_values = [1, 4] if all_parsers.engine == "pyarrow" else ["1", "4"]
    expected = DataFrame({"a": pd.Series(a_values, dtype=object), "c": [3, 6]})
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,174 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
Index,
MultiIndex,
Series,
concat,
)
import pandas._testing as tm
# TODO(1.4): Change me into individual xfails at release time
# Module-level pytest marker: requests the "pyarrow_skip" fixture for every
# test in this module.
# NOTE(review): the fixture is defined outside this file; presumably it skips
# the pyarrow engine -- confirm against the parser conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
def test_dtype_all_columns_empty(all_parsers):
    """A header-only file read with ``dtype=str`` gives empty str-typed columns."""
    # see gh-12048
    header_only = StringIO("A,B")

    result = all_parsers.read_csv(header_only, dtype=str)

    tm.assert_frame_equal(result, DataFrame({"A": [], "B": []}, dtype=str))
def test_empty_pass_dtype(all_parsers):
    """On a header-only file, only the column with a dtype entry gets that dtype."""
    header_only = "one,two"

    result = all_parsers.read_csv(StringIO(header_only), dtype={"one": "u1"})

    # "two" has no dtype entry and is expected as object.
    empty_u1 = np.empty(0, dtype="u1")
    empty_obj = np.empty(0, dtype=object)
    expected = DataFrame({"one": empty_u1, "two": empty_obj})
    tm.assert_frame_equal(result, expected)
def test_empty_with_index_pass_dtype(all_parsers):
    """On a header-only file, dtypes apply to both the index column and the data column."""
    header_only = "one,two"

    result = all_parsers.read_csv(
        StringIO(header_only), index_col=["one"], dtype={"one": "u1", 1: "f"}
    )

    expected_index = Index([], dtype="u1", name="one")
    expected = DataFrame({"two": np.empty(0, dtype="f")}, index=expected_index)
    tm.assert_frame_equal(result, expected)
def test_empty_with_multi_index_pass_dtype(all_parsers):
    """On a header-only file, dtypes apply to each level of a two-column index."""
    header_only = "one,two,three"

    result = all_parsers.read_csv(
        StringIO(header_only),
        index_col=["one", "two"],
        dtype={"one": "u1", 1: "f8"},
    )

    level_arrays = [np.empty(0, dtype="u1"), np.empty(0, dtype=np.float64)]
    expected_index = MultiIndex.from_arrays(level_arrays, names=["one", "two"])
    expected = DataFrame({"three": np.empty(0, dtype=object)}, index=expected_index)
    tm.assert_frame_equal(result, expected)
def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers):
    """Dtypes keyed by the renamed duplicate names apply on a header-only file."""
    header_only = "one,one"
    by_name = {"one": "u1", "one.1": "f"}

    result = all_parsers.read_csv(StringIO(header_only), dtype=by_name)

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}
    )
    tm.assert_frame_equal(result, expected)
def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers):
    """Dtypes keyed by position apply to duplicate columns on a header-only file."""
    header_only = "one,one"
    by_position = {0: "u1", 1: "f"}

    result = all_parsers.read_csv(StringIO(header_only), dtype=by_position)

    expected = DataFrame(
        {"one": np.empty(0, dtype="u1"), "one.1": np.empty(0, dtype="f")}
    )
    tm.assert_frame_equal(result, expected)
def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers):
    """Positional dtypes on duplicate columns match a frame built with ``concat``."""
    # see gh-9424
    header_only = "one,one"

    result = all_parsers.read_csv(StringIO(header_only), dtype={0: "u1", 1: "f"})

    first = Series([], name="one", dtype="u1")
    second = Series([], name="one.1", dtype="f")
    expected = concat([first, second], axis=1)
    tm.assert_frame_equal(result, expected)
def test_empty_with_dup_column_pass_dtype_by_indexes_raises(all_parsers):
    """Passing duplicate ``names`` raises ``ValueError`` even for empty input.

    The original body also built an ``expected`` frame that was never
    compared against anything; that dead code has been removed, and only the
    call expected to raise is kept inside the ``pytest.raises`` block.
    """
    # see gh-9424
    parser = all_parsers
    data = ""

    with pytest.raises(ValueError, match="Duplicate names"):
        parser.read_csv(StringIO(data), names=["one", "one"], dtype={0: "u1", 1: "f"})
@pytest.mark.parametrize(
    "dtype,expected",
    [
        (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)),
        (
            "category",
            DataFrame({"a": Categorical([]), "b": Categorical([])}),
        ),
        (
            {"a": "category", "b": "category"},
            DataFrame({"a": Categorical([]), "b": Categorical([])}),
        ),
        ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")),
        (
            "timedelta64[ns]",
            DataFrame(
                {
                    "a": Series([], dtype="timedelta64[ns]"),
                    "b": Series([], dtype="timedelta64[ns]"),
                },
            ),
        ),
        (
            {"a": np.int64, "b": np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
        (
            {0: np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
        (
            {"a": np.int64, 1: np.int32},
            DataFrame(
                {"a": Series([], dtype=np.int64), "b": Series([], dtype=np.int32)},
            ),
        ),
    ],
)
def test_empty_dtype(all_parsers, dtype, expected):
    """A header-only file takes the requested dtype(s), given globally or per column."""
    # see gh-14712
    header_only = StringIO("a,b")

    result = all_parsers.read_csv(header_only, header=0, dtype=dtype)

    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,664 @@
"""
Tests that apply specifically to the CParser. Unless specifically stated
as a CParser-specific issue, the goal is to eventually move as many of
these tests out of this module as soon as the Python parser can accept
further arguments when parsing.
"""
from decimal import Decimal
from io import (
BytesIO,
StringIO,
TextIOWrapper,
)
import mmap
import os
import tarfile
import numpy as np
import pytest
from pandas.compat import is_ci_environment
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import ParserError
import pandas.util._test_decorators as td
from pandas import (
DataFrame,
concat,
)
import pandas._testing as tm
@pytest.mark.parametrize(
    "malformed",
    ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
    ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
    """Malformed carriage-return input raises ``ParserError``."""
    # see gh-9205: test certain malformed input files that cause
    # buffer overflows in tokenizer.c
    expected_msg = "Buffer overflow caught - possible malformed input file."

    with pytest.raises(ParserError, match=expected_msg):
        c_parser_only.read_csv(StringIO(malformed))
def test_delim_whitespace_custom_terminator(c_parser_only):
    """``delim_whitespace`` works together with a custom ``lineterminator``."""
    # See gh-12912
    tilde_separated = "a b c~1 2 3~4 5 6~7 8 9"

    frame = c_parser_only.read_csv(
        StringIO(tilde_separated), lineterminator="~", delim_whitespace=True
    )

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]
    )
    tm.assert_frame_equal(frame, expected)
def test_dtype_and_names_error(c_parser_only):
    """Passing ``dtype`` with ``names`` casts when possible and raises clearly when not."""
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    clean_text = """
1.0 1
2.0 2
3.0 3
"""
    rows = [[1.0, 1], [2.0, 2], [3.0, 3]]
    column_names = ["a", "b"]

    # base cases
    result = c_parser_only.read_csv(StringIO(clean_text), sep=r"\s+", header=None)
    tm.assert_frame_equal(result, DataFrame(rows))

    result = c_parser_only.read_csv(
        StringIO(clean_text), sep=r"\s+", header=None, names=column_names
    )
    tm.assert_frame_equal(result, DataFrame(rows, columns=column_names))

    # fallback casting
    result = c_parser_only.read_csv(
        StringIO(clean_text),
        sep=r"\s+",
        header=None,
        names=column_names,
        dtype={"a": np.int32},
    )
    expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=column_names)
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    nan_text = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    expected_warning = RuntimeWarning if np_version_gte1p24 else None
    with pytest.raises(
        ValueError, match="cannot safely convert"
    ), tm.assert_produces_warning(expected_warning, check_stacklevel=False):
        c_parser_only.read_csv(
            StringIO(nan_text),
            sep=r"\s+",
            header=None,
            names=column_names,
            dtype={"a": np.int32},
        )
@pytest.mark.parametrize(
    "match,kwargs",
    [
        # For each of these cases, all of the dtypes are valid, just unsupported.
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}},
        ),
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]},
        ),
        (
            "the dtype timedelta64 is not supported for parsing",
            {"dtype": {"A": "timedelta64", "B": "float64"}},
        ),
        (
            f"the dtype {tm.ENDIAN}U8 is not supported for parsing",
            {"dtype": {"A": "U8"}},
        ),
    ],
    ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
    """Valid but unsupported dtypes raise ``TypeError`` with the expected message."""
    frame = DataFrame(
        np.random.default_rng(2).random((5, 2)),
        columns=list("AB"),
        index=["1A", "1B", "1C", "1D", "1E"],
    )

    with tm.ensure_clean("__unsupported_dtype__.csv") as csv_path:
        frame.to_csv(csv_path)

        with pytest.raises(TypeError, match=match):
            c_parser_only.read_csv(csv_path, index_col=0, **kwargs)
@td.skip_if_32bit
@pytest.mark.slow
def test_precise_conversion(c_parser_only):
    """``high`` float precision is at least as accurate as ``legacy``; ``round_trip`` matches ``float()``."""
    legacy_errors = []
    high_errors = []

    def error(val: float, actual_val: Decimal) -> Decimal:
        # Absolute distance between the parsed float and the exact decimal.
        return abs(Decimal(f"{val:.100}") - actual_val)

    def parse_with(csv_text: str, precision: str) -> float:
        # Parse the single value under column "a" with the given precision mode.
        frame = c_parser_only.read_csv(StringIO(csv_text), float_precision=precision)
        return float(frame["a"][0])

    # test numbers between 1 and 2
    for num in np.linspace(1.0, 2.0, num=500):
        # 25 decimal digits of precision
        text = f"a\n{num:.25}"
        digits = text[2:]  # strip the "a\n" header

        legacy_val = parse_with(text, "legacy")
        high_val = parse_with(text, "high")
        roundtrip_val = parse_with(text, "round_trip")
        exact = Decimal(digits)

        legacy_errors.append(error(legacy_val, exact))
        high_errors.append(error(high_val, exact))

        # round-trip should match float()
        assert roundtrip_val == float(digits)

    assert sum(high_errors) <= sum(legacy_errors)
    assert max(high_errors) <= max(legacy_errors)
def test_usecols_dtypes(c_parser_only):
    """Converters and dtypes keyed by name still apply when ``usecols`` drops a column."""
    csv_text = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

    def read_selected(usecols):
        # Fresh argument objects on every call; only ``usecols`` varies.
        return c_parser_only.read_csv(
            StringIO(csv_text),
            usecols=usecols,
            names=("a", "b", "c"),
            header=None,
            converters={"a": str},
            dtype={"b": int, "c": float},
        )

    all_columns = read_selected((0, 1, 2))
    without_middle = read_selected((0, 2))

    assert (all_columns.dtypes == [object, int, float]).all()
    assert (without_middle.dtypes == [object, float]).all()
def test_disable_bool_parsing(c_parser_only):
    """``dtype=object`` keeps Yes/No text as-is; ``na_filter=False`` keeps empty strings."""
    # see gh-2090
    csv_text = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

    as_objects = c_parser_only.read_csv(StringIO(csv_text), dtype=object)
    assert (as_objects.dtypes == object).all()

    unfiltered = c_parser_only.read_csv(
        StringIO(csv_text), dtype=object, na_filter=False
    )
    assert unfiltered["B"][2] == ""
def test_custom_lineterminator(c_parser_only):
    """A ``~`` line terminator parses the same as the newline-separated text."""
    tilde_text = "a,b,c~1,2,3~4,5,6"
    newline_text = tilde_text.replace("~", "\n")

    result = c_parser_only.read_csv(StringIO(tilde_text), lineterminator="~")
    expected = c_parser_only.read_csv(StringIO(newline_text))

    tm.assert_frame_equal(result, expected)
def test_parse_ragged_csv(c_parser_only):
    """Rows shorter than ``names`` parse the same as rows padded with empty fields."""

    def read_named(text, names):
        # Headerless read with explicit column names.
        return c_parser_only.read_csv(StringIO(text), header=None, names=names)

    ragged = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""
    padded = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    five_names = ["a", "b", "c", "d", "e"]

    tm.assert_frame_equal(
        read_named(ragged, five_names), read_named(padded, five_names)
    )

    # too many columns, cause segfault if not careful
    short_text = "1,2\n3,4,5"
    wide = read_named(short_text, range(50))
    narrow_reindexed = read_named(short_text, range(3)).reindex(columns=range(50))
    tm.assert_frame_equal(wide, narrow_reindexed)
def test_tokenize_CR_with_quoting(c_parser_only):
    """A carriage-return row break before quoted fields parses like a newline."""
    # see gh-3453
    cr_text = ' a,b,c\r"a,b","e,d","f,f"'
    lf_text = cr_text.replace("\r", "\n")

    # First without a header row, then with the default header handling.
    for read_kwargs in ({"header": None}, {}):
        result = c_parser_only.read_csv(StringIO(cr_text), **read_kwargs)
        expected = c_parser_only.read_csv(StringIO(lf_text), **read_kwargs)
        tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_grow_boundary_at_cap(c_parser_only):
    """Headers made only of commas parse for every length from 1 to 100."""
    # See gh-12494
    #
    # Cause of error was that the C parser
    # was not increasing the buffer size when
    # the desired space would fill the buffer
    # to capacity, which would later cause a
    # buffer overflow error when checking the
    # EOF terminator of the CSV stream.
    for comma_count in range(1, 101):
        # N commas delimit N + 1 unnamed columns.
        column_labels = [f"Unnamed: {i}" for i in range(comma_count + 1)]
        with StringIO("," * comma_count) as buffer:
            expected = DataFrame(columns=column_labels)
            frame = c_parser_only.read_csv(buffer)
        tm.assert_frame_equal(frame, expected)
def test_parse_trim_buffers(c_parser_only):
    """Chunked reads that end in a small residual chunk return uncorrupted data."""
    # This test is part of a bugfix for gh-13703. It attempts to
    # stress the system memory allocator, to cause it to move the
    # stream buffer and either let the OS reclaim the region, or let
    # other memory requests of parser otherwise modify the contents
    # of memory space, where it was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = (
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
    )

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize = 128
    n_lines = 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating the row `n_lines` times.
    row = tuple(field or np.nan for field in record_.split(","))
    expected = DataFrame([row] * n_lines, dtype=object, columns=None, index=None)

    def read_in_chunks(**extra_kwargs):
        # StringIO feeds the CSV text; the reader yields frames of `chunksize`
        # lines and finally a residual chunk of really small size.
        with c_parser_only.read_csv(
            StringIO(csv_data),
            header=None,
            dtype=object,
            chunksize=chunksize,
            **extra_kwargs,
        ) as reader:
            return concat(reader, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(read_in_chunks(), expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    tm.assert_frame_equal(read_in_chunks(encoding="utf_8"), expected)
def test_internal_null_byte(c_parser_only):
    """A field holding only a null byte is compared against NaN."""
    # see gh-14012
    #
    # The null byte ('\x00') should not be used as a
    # true line terminator, escape character, or comment
    # character, only as a placeholder to indicate that
    # none was specified.
    #
    # This test should be moved to test_common.py ONLY when
    # Python's csv class supports parsing '\x00'.
    column_names = ["a", "b", "c"]
    csv_text = "1,2,3\n4,\x00,6\n7,8,9"

    result = c_parser_only.read_csv(StringIO(csv_text), names=column_names)

    expected = DataFrame(
        [[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=column_names
    )
    tm.assert_frame_equal(result, expected)
def test_read_nrows_large(c_parser_only):
    """``nrows`` limits a large read to the narrow section that precedes wider rows."""
    # gh-7626 - Read only nrows of data in for large inputs (>262144b)
    narrow_header = "\t".join(f"COL_HEADER_{i}" for i in range(10)) + "\n"
    narrow_row = "\t".join(["somedatasomedatasomedata1"] * 10) + "\n"
    wide_header = "\t".join(f"COL_HEADER_{i}" for i in range(15)) + "\n"
    wide_row = "\t".join(["somedatasomedatasomedata2"] * 15) + "\n"

    # 1050 narrow rows, then a wider header and two wider rows.
    test_input = narrow_header + narrow_row * 1050 + wide_header + wide_row * 2

    frame = c_parser_only.read_csv(StringIO(test_input), sep="\t", nrows=1010)

    assert frame.size == 1010 * 10
def test_float_precision_round_trip_with_text(c_parser_only):
    """``float_precision="round_trip"`` leaves non-numeric text as text."""
    # see gh-15140
    frame = c_parser_only.read_csv(
        StringIO("a"), header=None, float_precision="round_trip"
    )

    expected = DataFrame({0: ["a"]})
    tm.assert_frame_equal(frame, expected)
def test_large_difference_in_columns(c_parser_only):
    """Read column 0 from input whose first row is far wider than the rest."""
    # see gh-14125
    n_fields = 10000
    wide_line = ",".join(["X"] * n_fields) + "\n"
    short_line = "XXXXXX XXXXXX,111111111111111\n"
    # Drop the trailing newline so the input does not end with an empty row.
    payload = (wide_line + short_line * 6)[:-1]

    first_fields = [line.split(",")[0] for line in payload.split("\n")]
    wanted = DataFrame(first_fields)

    observed = c_parser_only.read_csv(StringIO(payload), header=None, usecols=[0])
    tm.assert_frame_equal(observed, wanted)
def test_data_after_quote(c_parser_only):
    """Check that characters following a closing quote stay part of the field."""
    # see gh-15910
    wanted = DataFrame({"a": ["1", "ba"]})
    observed = c_parser_only.read_csv(StringIO('a\n1\n"b"a'))
    tm.assert_frame_equal(observed, wanted)
def test_comment_whitespace_delimited(c_parser_only, capsys):
    """Check comment stripping with a whitespace delimiter and ``on_bad_lines="warn"``.

    Rows holding more fields than the first row are expected to be reported on
    stderr as skipped and to be absent from the resulting frame.
    """
    raw = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
    wanted = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])

    frame = c_parser_only.read_csv(
        StringIO(raw),
        comment="#",
        header=None,
        delimiter="\\s+",
        skiprows=0,
        on_bad_lines="warn",
    )
    stderr_text = capsys.readouterr().err

    # skipped lines 2, 3, 4, 9
    skipped_lines = (2, 3, 4, 9)
    for number in skipped_lines:
        assert f"Skipping line {number}" in stderr_text

    tm.assert_frame_equal(frame, wanted)
def test_file_like_no_next(c_parser_only):
    """Read from a buffer whose ``next``/``__next__`` always raise."""
    # gh-16530: the file-like need not have a "next" or "__next__"
    # attribute despite having an "__iter__" attribute.
    #
    # NOTE: This is only true for the C engine, not Python engine.
    class NoNextBuffer(StringIO):
        def __next__(self):
            raise AttributeError("No next method")

        next = __next__

    wanted = DataFrame({"a": [1]})
    source = NoNextBuffer("a\n1")

    observed = c_parser_only.read_csv(source)
    tm.assert_frame_equal(observed, wanted)
def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    """Expect ``UnicodeError`` when a surrogate-escaped byte must be encoded as UTF-8."""
    # see gh-22748
    raw = BytesIO(b"\xB0")
    # Decoding as ASCII with surrogateescape turns the byte into a lone surrogate.
    wrapped = TextIOWrapper(raw, encoding="ascii", errors="surrogateescape")

    with pytest.raises(UnicodeError, match="'utf-8' codec can't encode character"):
        c_parser_only.read_csv(wrapped, encoding="UTF-8")
@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
    """Parse a CSV member handed over as a file object from an open tar archive."""
    # see gh-16530
    #
    # Unfortunately, Python's CSV library can't handle
    # tarfile objects (expects string, not bytes when
    # iterating through a file-like).
    wanted = DataFrame({"a": [1]})
    archive_path = os.path.join(csv_dir_path, f"tar_csv{tar_suffix}")

    with tarfile.open(archive_path, "r") as archive:
        member = archive.extractfile("tar_data.csv")
        observed = c_parser_only.read_csv(member)

        tm.assert_frame_equal(observed, wanted)
@pytest.mark.single_cpu
@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.")
def test_bytes_exceed_2gb(c_parser_only):
    """Parse a single column whose total size is larger than 2GB."""
    # see gh-16798
    #
    # Read from a "CSV" that has a column larger than 2GB.
    if c_parser_only.low_memory:
        pytest.skip("not a low_memory test")

    # csv takes 10 seconds to construct, spikes memory to 8GB+, the whole test
    # spikes up to 10.4GB on the c_high case
    one_mib_field = "x" * (1 << 20)
    body = "\n".join(one_mib_field for _ in range(2100))
    buffer_ = StringIO("strings\n" + body)

    frame = c_parser_only.read_csv(buffer_)
    assert not frame.empty
def test_chunk_whitespace_on_boundary(c_parser_only):
    """Check a leading-space field that follows a 256 KiB block of input."""
    # see gh-9735: this issue is C parser-specific (bug when
    # parsing whitespace and characters at chunk boundary)
    #
    # This test case has a field too large for the Python parser / CSV library.
    long_field = "a" * (1024 * 256 - 2)
    head = long_field + "\na"
    tail = "\n a"
    wanted = DataFrame([long_field, "a", " a"])

    observed = c_parser_only.read_csv(StringIO(head + tail), header=None)
    tm.assert_frame_equal(observed, wanted)
def test_file_handles_mmap(c_parser_only, csv1):
    """Check that a caller-supplied ``mmap`` object is still open after parsing."""
    # gh-14418
    #
    # Don't close user provided file handles.
    with open(csv1, encoding="utf-8") as handle, mmap.mmap(
        handle.fileno(), 0, access=mmap.ACCESS_READ
    ) as mapped:
        c_parser_only.read_csv(mapped)
        assert not mapped.closed
def test_file_binary_mode(c_parser_only):
    """Parse a file handle that was opened in binary (``"rb"``) mode."""
    # see gh-23779
    wanted = DataFrame([[1, 2, 3], [4, 5, 6]])

    with tm.ensure_clean() as path:
        with open(path, "w", encoding="utf-8") as writer:
            writer.write("1,2,3\n4,5,6")

        with open(path, "rb") as reader:
            observed = c_parser_only.read_csv(reader, header=None)
            tm.assert_frame_equal(observed, wanted)
def test_unix_style_breaks(c_parser_only):
    """Read a header-only file with blank lines and newline-only line endings."""
    # GH 11020
    wanted = DataFrame(columns=["col_1", "col_2", "col_3"])

    with tm.ensure_clean() as path:
        with open(path, "w", newline="\n", encoding="utf-8") as writer:
            writer.write("blah\n\ncol_1,col_2,col_3\n\n")

        observed = c_parser_only.read_csv(
            path, skiprows=2, encoding="utf-8", engine="c"
        )

    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    """Parse numbers using thousands/decimal separators under each float precision."""
    wanted = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    read_options = {
        "sep": "|",
        "thousands": thousands,
        "decimal": decimal,
        "float_precision": float_precision,
    }
    observed = c_parser_only.read_csv(StringIO(data), **read_options)

    tm.assert_frame_equal(observed, wanted)
def test_float_precision_options(c_parser_only):
    """Compare the default, "high" and "legacy" float parsers and reject junk."""
    # GH 17154, 36228
    text = "foo\n243.164\n"

    def read(**options):
        # Fresh buffer per call: a StringIO is consumed by each read.
        return c_parser_only.read_csv(StringIO(text), **options)

    default_frame = read()
    high_frame = read(float_precision="high")
    tm.assert_frame_equal(default_frame, high_frame)

    legacy_frame = read(float_precision="legacy")
    assert default_frame.iloc[0, 0] != legacy_frame.iloc[0, 0]

    with pytest.raises(
        ValueError, match="Unrecognized float_precision option: junk"
    ):
        read(float_precision="junk")

View File

@@ -0,0 +1,168 @@
"""
Tests that comments are properly handled during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import DataFrame
import pandas._testing as tm
# Module-level mark: pytest applies ``usefixtures("pyarrow_skip")`` to every test
# collected from this module.
# NOTE(review): ``pyarrow_skip`` is defined outside this file; presumably it skips
# the test for the pyarrow engine - confirm against the conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("na_values", [None, ["NaN"]])
def test_comment(all_parsers, na_values):
    """Check that text after the comment character is dropped from a data row."""
    raw = """A,B,C
1,2.,4.#hello world
5.,NaN,10.0
"""
    wanted = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )

    observed = all_parsers.read_csv(StringIO(raw), comment="#", na_values=na_values)
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize(
    "read_kwargs", [{}, {"lineterminator": "*"}, {"delim_whitespace": True}]
)
def test_line_comment(all_parsers, read_kwargs, request):
    """Check that whole-line and trailing comments are ignored.

    The input is adapted to the parametrized options: commas become spaces for
    ``delim_whitespace`` and newlines become the custom ``lineterminator``.
    For a custom terminator on a non-C engine the test is marked xfail.
    """
    parser = all_parsers
    data = """# empty
A,B,C
1,2.,4.#hello world
#ignore this line
5.,NaN,10.0
"""
    if read_kwargs.get("delim_whitespace"):
        data = data.replace(",", " ")
    elif read_kwargs.get("lineterminator"):
        if parser.engine != "c":
            mark = pytest.mark.xfail(
                reason="Custom terminator not supported with Python engine"
            )
            request.node.add_marker(mark)

        data = data.replace("\n", read_kwargs.get("lineterminator"))

    # pytest hands the parametrized dict to every invocation without copying
    # it, so build a new dict instead of writing "comment" into the shared one.
    kwargs = {**read_kwargs, "comment": "#"}
    result = parser.read_csv(StringIO(data), **kwargs)

    expected = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )
    tm.assert_frame_equal(result, expected)
def test_comment_skiprows(all_parsers):
    """Combine ``comment`` with ``skiprows=4`` on input that starts with junk lines."""
    raw = """# empty
random line
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # This should ignore the first four lines (including comments).
    wanted = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )

    observed = all_parsers.read_csv(StringIO(raw), comment="#", skiprows=4)
    tm.assert_frame_equal(observed, wanted)
def test_comment_header(all_parsers):
    """Combine ``comment`` with ``header=1`` on input that starts with comment lines."""
    raw = """# empty
# second empty line
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Header should begin at the second non-comment line.
    wanted = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )

    observed = all_parsers.read_csv(StringIO(raw), comment="#", header=1)
    tm.assert_frame_equal(observed, wanted)
def test_comment_skiprows_header(all_parsers):
    """Combine ``comment``, ``skiprows=4`` and ``header=1`` in a single read."""
    raw = """# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0
"""
    # Skiprows should skip the first 4 lines (including comments),
    # while header should start from the second non-commented line,
    # starting with line 5.
    wanted = DataFrame(
        [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
    )

    observed = all_parsers.read_csv(
        StringIO(raw), comment="#", skiprows=4, header=1
    )
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"])
def test_custom_comment_char(all_parsers, comment_char):
    """Check trailing comments are stripped for several comment characters."""
    template = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo"
    # Swap the placeholder "#" for the character under test.
    raw = template.replace("#", comment_char)
    wanted = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])

    observed = all_parsers.read_csv(StringIO(raw), comment=comment_char)
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize("header", ["infer", None])
def test_comment_first_line(all_parsers, header):
    """Check a comment on the first line with an inferred or absent header."""
    # see gh-4623
    raw = "# notes\na,b,c\n# more notes\n1,2,3"

    if header is not None:
        wanted = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
    else:
        wanted = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]})

    observed = all_parsers.read_csv(StringIO(raw), comment="#", header=header)
    tm.assert_frame_equal(observed, wanted)
def test_comment_char_in_default_value(all_parsers, request):
    """Read data where an NA marker ("#N/A") begins with the comment character."""
    # GH#34002
    if all_parsers.engine == "c":
        # NA value containing comment char is interpreted as comment
        xfail_mark = pytest.mark.xfail(
            reason="see gh-34002: works on the python engine but not the c engine",
            raises=AssertionError,
        )
        request.node.add_marker(xfail_mark)

    rows = [
        "# this is a comment\n",
        "col1,col2,col3,col4\n",
        "1,2,3,4#inline comment\n",
        "4,5#,6,10\n",
        "7,8,#N/A,11\n",
    ]
    raw = "".join(rows)
    wanted = DataFrame(
        {
            "col1": [1, 4, 7],
            "col2": [2, 5, 8],
            "col3": [3.0, np.nan, np.nan],
            "col4": [4.0, np.nan, 11.0],
        }
    )

    observed = all_parsers.read_csv(StringIO(raw), comment="#", na_values="#N/A")
    tm.assert_frame_equal(observed, wanted)

View File

@@ -0,0 +1,219 @@
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""
import os
from pathlib import Path
import tarfile
import zipfile
import pytest
from pandas import DataFrame
import pandas._testing as tm
# Reusable decorator that attaches the ``pyarrow_skip`` fixture to a test.
# NOTE(review): the fixture is defined outside this file; presumably it skips the
# test for the pyarrow engine - confirm against the conftest.
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.fixture(params=[True, False])
def buffer(request):
    """Boolean fixture; tests use it to choose between an open handle and a path."""
    use_buffer = request.param
    return use_buffer
@pytest.fixture
def parser_and_data(all_parsers, csv1):
    """Return ``(parser, raw bytes of csv1, frame parsed from csv1)``."""
    with open(csv1, "rb") as handle:
        raw_bytes = handle.read()

    reference = all_parsers.read_csv(csv1)
    return all_parsers, raw_bytes, reference
@skip_pyarrow
@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"])
def test_zip(parser_and_data, compression):
    """Read a single-member zip archive by path, or by handle for ``"zip2"``."""
    parser, payload, wanted = parser_and_data

    with tm.ensure_clean("test_file.zip") as path:
        with zipfile.ZipFile(path, mode="w") as archive:
            archive.writestr("test_file", payload)

        if compression != "zip2":
            observed = parser.read_csv(path, compression=compression)
        else:
            # "zip2" is the variant that passes an open binary handle.
            with open(path, "rb") as handle:
                observed = parser.read_csv(handle, compression="zip")

        tm.assert_frame_equal(observed, wanted)
@skip_pyarrow
@pytest.mark.parametrize("compression", ["zip", "infer"])
def test_zip_error_multiple_files(parser_and_data, compression):
    """Expect ``ValueError`` for a zip archive that contains two members."""
    parser, payload, _ = parser_and_data

    with tm.ensure_clean("combined_zip.zip") as path:
        with zipfile.ZipFile(path, mode="w") as archive:
            for member_name in ("test_file", "second_file"):
                archive.writestr(member_name, payload)

        with pytest.raises(ValueError, match="Multiple files"):
            parser.read_csv(path, compression=compression)
@skip_pyarrow
def test_zip_error_no_files(parser_and_data):
    """Expect ``ValueError`` for a zip archive that contains no members."""
    parser = parser_and_data[0]

    with tm.ensure_clean() as path:
        # Opening and closing in write mode produces an empty archive.
        zipfile.ZipFile(path, mode="w").close()

        with pytest.raises(ValueError, match="Zero files"):
            parser.read_csv(path, compression="zip")
@skip_pyarrow
def test_zip_error_invalid_zip(parser_and_data):
    """Expect ``BadZipFile`` when the handle does not point at a zip archive."""
    parser = parser_and_data[0]

    with tm.ensure_clean() as path, open(path, "rb") as handle:
        with pytest.raises(zipfile.BadZipFile, match="File is not a zip file"):
            parser.read_csv(handle, compression="zip")
@skip_pyarrow
@pytest.mark.parametrize("filename", [None, "test.{ext}"])
def test_compression(
    request,
    parser_and_data,
    compression_only,
    buffer,
    filename,
    compression_to_extension,
):
    """Round-trip data through each compression type, by path or by handle.

    With a filename the compression is passed as ``"infer"``; otherwise the
    compression type is given explicitly.
    """
    parser, payload, wanted = parser_and_data
    compress_type = compression_only

    if filename is not None:
        extension = compression_to_extension[compress_type]
        filename = filename.format(ext=extension)

    if filename and buffer:
        xfail_mark = pytest.mark.xfail(
            reason="Cannot deduce compression from buffer of compressed data."
        )
        request.node.add_marker(xfail_mark)

    with tm.ensure_clean(filename=filename) as path:
        tm.write_to_compressed(compress_type, path, payload)
        how = "infer" if filename else compress_type

        if not buffer:
            observed = parser.read_csv(path, compression=how)
        else:
            with open(path, "rb") as handle:
                observed = parser.read_csv(handle, compression=how)

        tm.assert_frame_equal(observed, wanted)
@skip_pyarrow
@pytest.mark.parametrize("ext", [None, "gz", "bz2"])
def test_infer_compression(all_parsers, csv1, buffer, ext):
    """Check ``compression="infer"`` against a plain read of the same data."""
    # see gh-9770
    parser = all_parsers
    base_options = {"index_col": 0, "parse_dates": True}
    wanted = parser.read_csv(csv1, **base_options)

    infer_options = {**base_options, "compression": "infer"}

    if not buffer:
        suffix = f".{ext}" if ext else ""
        observed = parser.read_csv(csv1 + suffix, **infer_options)
    else:
        with open(csv1, encoding="utf-8") as handle:
            observed = parser.read_csv(handle, **infer_options)

    tm.assert_frame_equal(observed, wanted)
@skip_pyarrow
def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt):
    """Read a zipped, tab-separated file stored in a UTF-16/32 style encoding."""
    # see gh-18071, gh-24130
    wanted = DataFrame(
        {
            "Country": ["Venezuela", "Venezuela"],
            "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
        }
    )
    archive = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip")
    codec = encoding_fmt.format(utf_value)

    observed = all_parsers.read_csv(
        archive, encoding=codec, compression="zip", sep="\t"
    )
    tm.assert_frame_equal(observed, wanted)
@skip_pyarrow
@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"])
def test_invalid_compression(all_parsers, invalid_compression):
    """Expect ``ValueError`` for an unknown ``compression`` argument."""
    expected_message = f"Unrecognized compression type: {invalid_compression}"

    with pytest.raises(ValueError, match=expected_message):
        all_parsers.read_csv("test_file.zip", compression=invalid_compression)
@skip_pyarrow
def test_compression_tar_archive(all_parsers, csv_dir_path):
    """Read ``tar_csv.tar.gz`` by path and check the resulting column names."""
    archive = os.path.join(csv_dir_path, "tar_csv.tar.gz")
    frame = all_parsers.read_csv(archive)

    column_names = list(frame.columns)
    assert column_names == ["a"]
def test_ignore_compression_extension(all_parsers):
    """Check ``compression=None`` reads a plain file despite its ``.zip`` suffix."""
    frame = DataFrame({"a": [0, 1]})

    with tm.ensure_clean("test.csv") as path_csv, tm.ensure_clean(
        "test.csv.zip"
    ) as path_zip:
        # make sure to create un-compressed file with zip extension
        frame.to_csv(path_csv, index=False)
        plain_text = Path(path_csv).read_text(encoding="utf-8")
        Path(path_zip).write_text(plain_text, encoding="utf-8")

        observed = all_parsers.read_csv(path_zip, compression=None)
        tm.assert_frame_equal(observed, frame)
@skip_pyarrow
def test_writes_tar_gz(all_parsers):
    """Write a frame to a ``.tar.gz`` path and read it back two different ways."""
    parser = all_parsers
    frame = DataFrame(
        {
            "Country": ["Venezuela", "Venezuela"],
            "Twitter": ["Hugo Chávez Frías", "Henrique Capriles R."],
        }
    )

    with tm.ensure_clean("test.tar.gz") as tar_path:
        frame.to_csv(tar_path, index=False)

        # test that read_csv infers .tar.gz to gzip:
        round_tripped = parser.read_csv(tar_path)
        tm.assert_frame_equal(round_tripped, frame)

        # test that file is indeed gzipped:
        with tarfile.open(tar_path, "r:gz") as archive:
            first_member = archive.getnames()[0]
            from_member = parser.read_csv(
                archive.extractfile(first_member), compression="infer"
            )
            tm.assert_frame_equal(from_member, frame)

View File

@@ -0,0 +1,36 @@
import numpy as np
import pytest
from pandas.errors import DtypeWarning
import pandas._testing as tm
from pandas.core.arrays import ArrowExtensionArray
from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
def test_concatenate_chunks_pyarrow():
    """Concatenate a float chunk and an integer chunk of Arrow-backed arrays."""
    # GH#51876
    pa = pytest.importorskip("pyarrow")

    float_chunk = {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}
    int_chunk = {0: ArrowExtensionArray(pa.array([1, 2]))}
    wanted = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))

    combined = _concatenate_chunks([float_chunk, int_chunk])
    tm.assert_extension_array_equal(combined[0], wanted)
def test_concatenate_chunks_pyarrow_strings():
    """Concatenate a float chunk and a string chunk, expecting ``DtypeWarning``."""
    # GH#51876
    pa = pytest.importorskip("pyarrow")

    float_chunk = {0: ArrowExtensionArray(pa.array([1.5, 2.5]))}
    string_chunk = {0: ArrowExtensionArray(pa.array(["a", "b"]))}

    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
        combined = _concatenate_chunks([float_chunk, string_chunk])

    float_part = np.array([1.5, 2.5], dtype=object)
    string_part = np.array(["a", "b"])
    wanted = np.concatenate([float_part, string_part])
    tm.assert_numpy_array_equal(combined[0], wanted)

View File

@@ -0,0 +1,203 @@
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
from dateutil.parser import parse
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
# Module-level mark: pytest applies ``usefixtures("pyarrow_skip")`` to every test
# collected from this module.
# NOTE(review): ``pyarrow_skip`` is defined outside this file; presumably it skips
# the test for the pyarrow engine - confirm against the conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
def test_converters_type_must_be_dict(all_parsers):
    """Expect ``TypeError`` when ``converters`` is passed as ``0``."""
    raw = """index,A,B,C,D
foo,2,3,4,5
"""
    source = StringIO(raw)

    with pytest.raises(TypeError, match="Type converters.+"):
        all_parsers.read_csv(source, converters=0)
@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
    "converter", [parse, lambda x: int(x.split("/")[2])]  # Produce integer.
)
def test_converters(all_parsers, column, converter):
    """Apply a converter to column "D", addressed by position or by name."""
    parser = all_parsers
    raw = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
    # Reference: read without converters, then map the converter over "D".
    wanted = parser.read_csv(StringIO(raw))
    wanted["D"] = wanted["D"].map(converter)

    observed = parser.read_csv(StringIO(raw), converters={column: converter})
    tm.assert_frame_equal(observed, wanted)
def test_converters_no_implicit_conv(all_parsers):
    """Check that a converted column keeps its string values (leading zeros kept)."""
    # see gh-2184
    raw = """000102,1.2,A\n001245,2,B"""
    strip_only = {0: lambda x: x.strip()}

    # Column 0 should not be casted to numeric and should remain as object.
    wanted = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]])

    observed = all_parsers.read_csv(
        StringIO(raw), header=None, converters=strip_only
    )
    tm.assert_frame_equal(observed, wanted)
def test_converters_euro_decimal_format(all_parsers):
    """Convert comma-decimal numbers to floats through per-column converters."""
    # see gh-583
    raw = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""

    # One shared converter for all three numeric columns.
    numeric_columns = ["Number1", "Number2", "Number3"]
    converters = dict.fromkeys(numeric_columns, lambda x: float(x.replace(",", ".")))

    wanted = DataFrame(
        [
            [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
            [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
            [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
        ],
        columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
    )

    observed = all_parsers.read_csv(StringIO(raw), sep=";", converters=converters)
    tm.assert_frame_equal(observed, wanted)
def test_converters_corner_with_nans(all_parsers):
    """Run converters that return NaN for blank fields and compare two variants."""
    parser = all_parsers
    raw = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    # Example converters.
    def convert_days(x):
        # Blank -> NaN; "N+" -> N + 1; otherwise the integer value.
        x = x.strip()
        if not x:
            return np.nan
        if x.endswith("+"):
            return int(x[:-1]) + 1
        return int(x)

    def convert_days_sentinel(x):
        # Same rules as ``convert_days``, kept as a separate callable.
        x = x.strip()
        if not x:
            return np.nan
        if x.endswith("+"):
            return int(x[:-1]) + 1
        return int(x)

    def convert_score(x):
        # Blank -> NaN; "lo-hi" -> midpoint of the range; otherwise a float.
        x = x.strip()
        if not x:
            return np.nan
        if x.find("-") > 0:
            low, high = map(int, x.split("-"))
            return 0.5 * (low + high)
        return float(x)

    frames = []
    for day_converter in (convert_days, convert_days_sentinel):
        frame = parser.read_csv(
            StringIO(raw),
            converters={"score": convert_score, "days": day_converter},
            na_values=["", None],
        )
        assert pd.isna(frame["days"][1])
        frames.append(frame)

    first, second = frames
    tm.assert_frame_equal(first, second)
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
    """Apply a converter to the column that is also used as ``index_col``."""
    # see gh-1835 , GH#40589
    wanted_index = Index(["1", "3"], name="A", dtype="object")
    wanted = DataFrame({"B": [2, 4]}, index=wanted_index)

    observed = all_parsers.read_csv(
        StringIO("A;B\n1;2\n3;4"), sep=";", index_col="A", converters={"A": conv_f}
    )
    tm.assert_frame_equal(observed, wanted)
def test_converter_identity_object(all_parsers):
    """Check that an identity converter leaves the column values as strings."""
    # GH#40589
    wanted = DataFrame({"A": ["1", "3"], "B": [2, 4]})

    observed = all_parsers.read_csv(
        StringIO("A,B\n1,2\n3,4"), converters={"A": lambda x: x}
    )
    tm.assert_frame_equal(observed, wanted)
def test_converter_multi_index(all_parsers):
    """Key converters by tuple when the header spans two rows."""
    # GH 42446
    by_column = {
        ("A", "X"): np.int32,
        ("B", "Y"): np.int32,
        ("B", "Z"): np.float32,
    }
    wanted = DataFrame(
        {
            ("A", "X"): np.int32([1]),
            ("B", "Y"): np.int32([2]),
            ("B", "Z"): np.float32([3]),
        }
    )

    observed = all_parsers.read_csv(
        StringIO("A,B,B\nX,Y,Z\n1,2,3"),
        header=list(range(2)),
        converters=by_column,
    )
    tm.assert_frame_equal(observed, wanted)

View File

@@ -0,0 +1,156 @@
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.errors import ParserWarning
from pandas import DataFrame
import pandas._testing as tm
# Module-level mark: pytest applies ``usefixtures("pyarrow_skip")`` to every test
# collected from this module.
# NOTE(review): ``pyarrow_skip`` is defined outside this file; presumably it skips
# the test for the pyarrow engine - confirm against the conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.fixture
def custom_dialect():
    """Return ``(name, kwargs)`` describing an unusual CSV dialect."""
    settings = {
        "doublequote": False,
        "escapechar": "~",
        "delimiter": ":",
        "skipinitialspace": False,
        "quotechar": "~",
        "quoting": 3,
    }
    return "weird", settings
def test_dialect(all_parsers):
    """Read with an ``csv.excel`` dialect whose quoting is set to ``QUOTE_NONE``."""
    parser = all_parsers

    no_quote_dialect = csv.excel()
    no_quote_dialect.quoting = csv.QUOTE_NONE

    quoted_text = """\
label1,label2,label3
index1,"a,c,e
index2,b,d,f
"""
    observed = parser.read_csv(StringIO(quoted_text), dialect=no_quote_dialect)

    plain_text = """\
label1,label2,label3
index1,a,c,e
index2,b,d,f
"""
    # Reference: read the unquoted data, then put the literal quote back.
    wanted = parser.read_csv(StringIO(plain_text))
    wanted.replace("a", '"a', inplace=True)

    tm.assert_frame_equal(observed, wanted)
def test_dialect_str(all_parsers):
    """Select a registered dialect by passing its name as a string."""
    registered_name = "mydialect"
    raw = """\
fruit:vegetable
apple:broccoli
pear:tomato
"""
    wanted = DataFrame(
        {"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]}
    )

    with tm.with_csv_dialect(registered_name, delimiter=":"):
        observed = all_parsers.read_csv(StringIO(raw), dialect=registered_name)
        tm.assert_frame_equal(observed, wanted)
def test_invalid_dialect(all_parsers):
    """Expect ``ValueError`` when ``dialect`` is an unrelated, empty class."""

    class InvalidDialect:
        pass

    source = StringIO("a\n1")

    with pytest.raises(ValueError, match="Invalid dialect"):
        all_parsers.read_csv(source, dialect=InvalidDialect)
@pytest.mark.parametrize(
    "arg",
    [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
)
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
    """Check when an explicit keyword conflicting with the dialect warns.

    A ``ParserWarning`` is expected only when the keyword is given a value that
    is neither the dialect's value nor the parser default.
    """
    # see gh-23761.
    dialect_name, dialect_kwargs = custom_dialect
    wanted = DataFrame({"a": [1], "b": [2]})
    raw = "a:b\n1:2"

    expected_warning = None
    overrides = {}

    # arg=None tests when we pass in the dialect without any other arguments.
    if arg is None:
        pass
    elif value == "dialect":  # No conflict --> no warning.
        overrides[arg] = dialect_kwargs[arg]
    elif value == "default":  # Default --> no warning.
        from pandas.io.parsers.base_parser import parser_defaults

        overrides[arg] = parser_defaults[arg]
    else:  # Non-default + conflict with dialect --> warning.
        expected_warning = ParserWarning
        overrides[arg] = "blah"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        observed = all_parsers.read_csv_check_warnings(
            expected_warning,
            "Conflicting values for",
            StringIO(raw),
            dialect=dialect_name,
            **overrides,
        )
        tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize(
    "kwargs,warning_klass",
    [
        ({"sep": ","}, None),  # sep is default --> sep_override=True
        ({"sep": "."}, ParserWarning),  # sep isn't default --> sep_override=False
        ({"delimiter": ":"}, None),  # No conflict
        ({"delimiter": None}, None),  # Default arguments --> sep_override=True
        ({"delimiter": ","}, ParserWarning),  # Conflict
        ({"delimiter": "."}, ParserWarning),  # Conflict
    ],
    ids=[
        "sep-override-true",
        "sep-override-false",
        "delimiter-no-conflict",
        "delimiter-default-arg",
        "delimiter-conflict",
        "delimiter-conflict2",
    ],
)
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
    """Check which ``sep``/``delimiter`` values warn about a dialect conflict."""
    # see gh-23761.
    name, settings = custom_dialect
    wanted = DataFrame({"a": [1], "b": [2]})
    source = StringIO("a:b\n1:2")

    with tm.with_csv_dialect(name, **settings):
        observed = all_parsers.read_csv_check_warnings(
            warning_klass,
            "Conflicting values for 'delimiter'",
            source,
            dialect=name,
            **kwargs,
        )
        tm.assert_frame_equal(observed, wanted)

View File

@@ -0,0 +1,318 @@
"""
Tests encoding functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import (
BytesIO,
TextIOWrapper,
)
import os
import tempfile
import uuid
import numpy as np
import pytest
from pandas import (
DataFrame,
read_csv,
)
import pandas._testing as tm
# Reusable decorators that attach the ``pyarrow_skip`` / ``pyarrow_xfail``
# fixtures to a test.
# NOTE(review): both fixtures are defined outside this file; presumably they skip
# or xfail the test for the pyarrow engine - confirm against the conftest.
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
def test_bytes_io_input(all_parsers):
    """Read cp1255-encoded bytes from a ``BytesIO`` with a non-ASCII header."""
    codec = "cp1255"
    payload = "שלום:1234\n562:123".encode(codec)
    wanted = DataFrame([[562, 123]], columns=["שלום", "1234"])

    observed = all_parsers.read_csv(BytesIO(payload), sep=":", encoding=codec)
    tm.assert_frame_equal(observed, wanted)
@skip_pyarrow
def test_read_csv_unicode(all_parsers):
    """Read UTF-8 bytes containing a non-ASCII letter, with ";" as separator."""
    payload = "\u0141aski, Jan;1".encode()
    wanted = DataFrame([["\u0141aski, Jan", 1]])

    observed = all_parsers.read_csv(
        BytesIO(payload), sep=";", encoding="utf-8", header=None
    )
    tm.assert_frame_equal(observed, wanted)
@xfail_pyarrow
@pytest.mark.parametrize("sep", [",", "\t"])
@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"])
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
    """Compare a UTF-16 file read with ``skiprows`` against the same UTF-8 text."""
    # see gh-2298
    parser = all_parsers
    text = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(
        ",", sep
    )
    read_options = {"sep": sep, "skiprows": 2}
    utf8 = "utf-8"
    file_name = f"__{uuid.uuid4()}__.csv"

    with tm.ensure_clean(file_name) as path:
        with open(path, "wb") as writer:
            writer.write(text.encode(encoding))

        reference_source = TextIOWrapper(BytesIO(text.encode(utf8)), encoding=utf8)
        with reference_source as bytes_buffer:
            observed = parser.read_csv(path, encoding=encoding, **read_options)
            wanted = parser.read_csv(bytes_buffer, encoding=utf8, **read_options)

        tm.assert_frame_equal(observed, wanted)
def test_utf16_example(all_parsers, csv_dir_path):
    """Read the UTF-16 sample file and check that it yields 50 rows."""
    sample = os.path.join(csv_dir_path, "utf16_ex.txt")
    frame = all_parsers.read_csv(sample, encoding="utf-16", sep="\t")

    row_count = len(frame)
    assert row_count == 50
def test_unicode_encoding(all_parsers, csv_dir_path):
    """Read a latin-1 file and check one value that contains accented letters."""
    sample = os.path.join(csv_dir_path, "unicode_series.csv")

    frame = all_parsers.read_csv(sample, header=None, encoding="latin-1")
    indexed = frame.set_index(0)

    wanted = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)"
    assert indexed[1][1632] == wanted
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # Basic test
        ("a\n1", {}, DataFrame({"a": [1]})),
        # "Regular" quoting
        ('"a"\n1', {"quotechar": '"'}, DataFrame({"a": [1]})),
        # Test in a data row instead of header
        ("b\n1", {"names": ["a"]}, DataFrame({"a": ["b", "1"]})),
        # Test in empty data row with skipping
        ("\n1", {"names": ["a"], "skip_blank_lines": True}, DataFrame({"a": [1]})),
        # Test in empty data row without skipping
        (
            "\n1",
            {"names": ["a"], "skip_blank_lines": False},
            DataFrame({"a": [np.nan, 1]}),
        ),
    ],
)
def test_utf8_bom(all_parsers, data, kwargs, expected, request):
    """Check that a leading UTF-8 byte-order mark does not end up in the result."""
    # see gh-4793
    parser = all_parsers
    utf8 = "utf-8"

    blank_first_row = data == "\n1" and kwargs.get("skip_blank_lines", True)
    if parser.engine == "pyarrow" and blank_first_row:
        # Manually xfail, since we don't have mechanism to xfail specific version
        request.node.add_marker(
            pytest.mark.xfail(reason="Pyarrow can't read blank lines")
        )

    # Prefix the text with the BOM character before encoding it.
    source = BytesIO(("\ufeff" + data).encode(utf8))
    observed = parser.read_csv(source, encoding=utf8, **kwargs)

    tm.assert_frame_equal(observed, expected)
def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt):
    """Read data encoded under each spelling of a UTF encoding name."""
    # see gh-13549
    codec = encoding_fmt.format(utf_value)
    payload = "mb_num,multibyte\n4.8,test".encode(codec)
    wanted = DataFrame({"mb_num": [4.8], "multibyte": ["test"]})

    observed = all_parsers.read_csv(BytesIO(payload), encoding=codec)
    tm.assert_frame_equal(observed, wanted)
@pytest.mark.parametrize(
    "file_path,encoding",
    [
        (("io", "data", "csv", "test1.csv"), "utf-8"),
        (("io", "parser", "data", "unicode_series.csv"), "latin-1"),
        (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"),
    ],
)
def test_binary_mode_file_buffers(all_parsers, file_path, encoding, datapath):
    """Read the same file through text, binary and unbuffered binary handles.

    Each handle must stay open after the read and yield the same frame as
    reading by path.
    """
    # gh-23779: Python csv engine shouldn't error on files opened in binary.
    # gh-31575: Python csv engine shouldn't error on files opened in raw binary.
    parser = all_parsers
    fpath = datapath(*file_path)
    wanted = parser.read_csv(fpath, encoding=encoding)

    # Text-mode handle: the handle itself carries the encoding.
    with open(fpath, encoding=encoding) as text_handle:
        observed = parser.read_csv(text_handle)
        assert not text_handle.closed
    tm.assert_frame_equal(wanted, observed)

    # Binary handles: first buffered (default), then raw (buffering=0).
    for open_options in ({}, {"buffering": 0}):
        with open(fpath, mode="rb", **open_options) as binary_handle:
            observed = parser.read_csv(binary_handle, encoding=encoding)
            assert not binary_handle.closed
        tm.assert_frame_equal(wanted, observed)
@skip_pyarrow
@pytest.mark.parametrize("pass_encoding", [True, False])
def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding):
    """Read from a text-mode temp file, with and without passing ``encoding``."""
    # see gh-24130
    codec = encoding_fmt.format(utf_value)
    wanted = DataFrame({"foo": ["bar"]})
    read_encoding = codec if pass_encoding else None

    with tm.ensure_clean(mode="w+", encoding=codec, return_filelike=True) as handle:
        handle.write("foo\nbar")
        handle.seek(0)

        observed = all_parsers.read_csv(handle, encoding=read_encoding)
        tm.assert_frame_equal(observed, wanted)
@skip_pyarrow
def test_encoding_named_temp_file(all_parsers):
    """Read shift-jis bytes from a ``NamedTemporaryFile`` and keep it open."""
    # see gh-31819
    codec = "shift-jis"
    title = "てすと"
    data = "こむ"
    wanted = DataFrame({title: [data]})
    payload = f"{title}\n{data}".encode(codec)

    with tempfile.NamedTemporaryFile() as handle:
        handle.write(payload)
        handle.seek(0)

        observed = all_parsers.read_csv(handle, encoding=codec)
        tm.assert_frame_equal(observed, wanted)
        assert not handle.closed
@pytest.mark.parametrize(
    "encoding", ["utf-8", "utf-16", "utf-16-be", "utf-16-le", "utf-32"]
)
def test_parse_encoded_special_characters(encoding):
    """Round-trip a non-ASCII character through ``read_csv`` for each encoding.

    The text is encoded with ``encoding``, read back from a ``BytesIO`` with a
    tab delimiter, and compared with the expected frame.
    """
    # GH16218 Verify parsing of data with encoded special characters
    # Data contains a Unicode 'FULLWIDTH COLON' (U+FF1A) at position (0,"a")
    #
    # The character is written as an escape so it cannot be dropped or
    # normalised by tooling that rewrites this file; without it the test only
    # exercises plain ASCII.
    special_value = "\uff1afoo"
    data = f"a\tb\n{special_value}\t0\nbar\t1\nbaz\t2"
    encoded_data = BytesIO(data.encode(encoding))
    result = read_csv(encoded_data, delimiter="\t", encoding=encoding)

    expected = DataFrame(
        data=[[special_value, 0], ["bar", 1], ["baz", 2]],
        columns=["a", "b"],
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize("encoding", ["utf-8", None, "utf-16", "cp1255", "latin-1"])
def test_encoding_memory_map(all_parsers, encoding):
    """Write a frame with ``encoding`` and read it back using ``memory_map=True``."""
    # GH40986
    wanted = DataFrame(
        {
            "name": ["Raphael", "Donatello", "Miguel Angel", "Leonardo"],
            "mask": ["red", "purple", "orange", "blue"],
            "weapon": ["sai", "bo staff", "nunchunk", "katana"],
        }
    )

    with tm.ensure_clean() as path:
        wanted.to_csv(path, index=False, encoding=encoding)
        observed = all_parsers.read_csv(path, encoding=encoding, memory_map=True)

    tm.assert_frame_equal(observed, wanted)
@xfail_pyarrow
def test_chunk_splits_multibyte_char(all_parsers):
    """
    Read a memory-mapped file whose last row ends in a two-byte UTF-8 character.

    GH 43540
    """
    # DEFAULT_CHUNKSIZE = 262144, defined in parsers.pyx
    filler = "a" * 127
    frame = DataFrame(data=[filler] * 2048)
    # Put two-bytes utf-8 encoded character "ą" at the end of chunk
    # utf-8 encoding of "ą" is b'\xc4\x85'
    frame.iloc[2047] = filler + "ą"

    with tm.ensure_clean("bug-gh43540.csv") as fname:
        frame.to_csv(fname, index=False, header=False, encoding="utf-8")
        read_back = all_parsers.read_csv(
            fname, header=None, memory_map=True, engine="c"
        )

    tm.assert_frame_equal(read_back, frame)
@xfail_pyarrow
def test_readcsv_memmap_utf8(all_parsers):
    """
    Round-trip rows of consecutive Unicode characters with ``memory_map=True``.

    GH 43787
    The file is written and read back as UTF-8.
    """
    line_length = 128
    first_code = ord(" ")
    stop_code = ord("\U00010080")

    # Build 128-char strings of consecutive Unicode chars, keeping only the
    # ones that can be encoded as UTF-8.
    lines = []
    for block_start in range(first_code, stop_code, line_length):
        candidate = "".join(map(chr, range(block_start, block_start + 0x80))) + "\n"
        try:
            candidate.encode("utf-8")
        except UnicodeEncodeError:
            continue
        else:
            lines.append(candidate)

    frame = DataFrame(lines)

    with tm.ensure_clean("utf8test.csv") as fname:
        frame.to_csv(fname, index=False, header=False, encoding="utf-8")
        read_back = all_parsers.read_csv(
            fname, header=None, memory_map=True, engine="c", encoding="utf-8"
        )

    tm.assert_frame_equal(frame, read_back)
@pytest.mark.usefixtures("pyarrow_xfail")
@pytest.mark.parametrize("mode", ["w+b", "w+t"])
def test_not_readable(all_parsers, mode):
    """Read a header-only CSV from a ``SpooledTemporaryFile`` handle."""
    # GH43439
    # Text-mode handles need str content, binary-mode handles need bytes.
    content = "abcd" if "t" in mode else b"abcd"
    with tempfile.SpooledTemporaryFile(mode=mode, encoding="utf-8") as handle:
        handle.write(content)
        handle.seek(0)
        df = all_parsers.read_csv(handle)
    tm.assert_frame_equal(df, DataFrame([], columns=["abcd"]))

View File

@@ -0,0 +1,686 @@
"""
Tests that the file header is properly handled or inferred
during parsing for all of the parsers defined in parsers.py
"""
from collections import namedtuple
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserError
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
# TODO(1.4): Change me to xfails at release time
# Decorator that makes a test request the "pyarrow_skip" fixture.
# NOTE(review): the fixture is defined outside this module; presumably it
# skips the test for the pyarrow engine -- confirm in conftest.
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@skip_pyarrow
def test_read_with_bad_header(all_parsers):
    """A header row index beyond the available lines raises ``ValueError``."""
    buffer = StringIO(",,")
    expected_msg = r"but only \d+ lines in file"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(buffer, header=[10])
def test_negative_header(all_parsers):
    """``header=-1`` is rejected with a ``ValueError``."""
    # see gh-27779
    data = "1,2,3,4,5\n6,7,8,9,10\n11,12,13,14,15\n"
    expected_msg = (
        "Passing negative integer to header is invalid. "
        "For no header, use header=None instead"
    )
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(StringIO(data), header=-1)
@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
def test_negative_multi_index_header(all_parsers, header):
    """A list ``header`` containing a negative integer raises ``ValueError``."""
    # see gh-27779
    data = "1,2,3,4,5\n6,7,8,9,10\n11,12,13,14,15\n"
    expected_msg = "cannot specify multi-index header with negative integers"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(StringIO(data), header=header)
@pytest.mark.parametrize("header", [True, False])
def test_bool_header_arg(all_parsers, header):
    """Passing a bool as ``header`` raises ``TypeError``."""
    # see gh-6114
    data = "MyColumn\na\nb\na\nb"
    expected_msg = "Passing a bool to header is invalid"
    with pytest.raises(TypeError, match=expected_msg):
        all_parsers.read_csv(StringIO(data), header=header)
@skip_pyarrow
def test_header_with_index_col(all_parsers):
    """Three ``names`` for four-field rows: the first field becomes the index."""
    column_names = ["A", "B", "C"]
    data = "foo,1,2,3\nbar,4,5,6\nbaz,7,8,9\n"

    result = all_parsers.read_csv(StringIO(data), names=column_names)

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result, expected)
def test_header_not_first_line(all_parsers):
    """``header=2`` over two junk lines matches ``header=0`` without them."""
    parser = all_parsers
    table = (
        "index,A,B,C,D\n"
        "foo,2,3,4,5\n"
        "bar,7,8,9,10\n"
        "baz,12,13,14,15\n"
    )
    # Two lines that must be ignored, placed in front of the real table.
    with_junk = "got,to,ignore,this,line\n" * 2 + table

    expected = parser.read_csv(StringIO(table), header=0, index_col=0)
    result = parser.read_csv(StringIO(with_junk), header=2, index_col=0)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_header_multi_index(all_parsers):
    """Four header rows and two index columns parse into a custom frame."""
    data = (
        "C0,,C_l0_g0,C_l0_g1,C_l0_g2\n"
        "C1,,C_l1_g0,C_l1_g1,C_l1_g2\n"
        "C2,,C_l2_g0,C_l2_g1,C_l2_g2\n"
        "C3,,C_l3_g0,C_l3_g1,C_l3_g2\n"
        "R0,R1,,,\n"
        "R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2\n"
        "R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2\n"
        "R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2\n"
        "R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2\n"
        "R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2\n"
    )
    result = all_parsers.read_csv(
        StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1]
    )
    expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "kwargs,msg",
    [
        (
            {"index_col": ["foo", "bar"]},
            (
                "index_col must only contain "
                "row numbers when specifying "
                "a multi-index header"
            ),
        ),
        (
            {"index_col": [0, 1], "names": ["foo", "bar"]},
            ("cannot specify names when specifying a multi-index header"),
        ),
        (
            {"index_col": [0, 1], "usecols": ["foo", "bar"]},
            ("cannot specify usecols when specifying a multi-index header"),
        ),
    ],
)
def test_header_multi_index_invalid(all_parsers, kwargs, msg):
    """Each listed option raises ``ValueError`` with a multi-row header."""
    header_rows = [0, 1, 2, 3]
    data = (
        "C0,,C_l0_g0,C_l0_g1,C_l0_g2\n"
        "C1,,C_l1_g0,C_l1_g1,C_l1_g2\n"
        "C2,,C_l2_g0,C_l2_g1,C_l2_g2\n"
        "C3,,C_l3_g0,C_l3_g1,C_l3_g2\n"
        "R0,R1,,,\n"
        "R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2\n"
        "R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2\n"
        "R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2\n"
        "R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2\n"
        "R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2\n"
    )
    with pytest.raises(ValueError, match=msg):
        all_parsers.read_csv(StringIO(data), header=header_rows, **kwargs)
# Two-field namedtuple; the parametrized cases below pass instances of it in
# "names" alongside equivalent plain tuples.
_TestTuple = namedtuple("_TestTuple", ["first", "second"])
@skip_pyarrow
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 3,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 3,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format1(all_parsers, kwargs):
    """Two header rows, then an all-empty row, read with ``index_col=0``.

    The same frame is expected whether the columns come from
    ``header=[0, 1]`` or from explicit tuple ``names`` with ``skiprows``.
    """
    data = (
        ",a,a,a,b,c,c\n"
        ",q,r,s,t,u,v\n"
        ",,,,,,\n"
        "one,1,2,3,4,5,6\n"
        "two,7,8,9,10,11,12"
    )
    result = all_parsers.read_csv(StringIO(data), index_col=0, **kwargs)

    column_pairs = [
        ("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")
    ]
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(column_pairs),
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 2,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 2,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format2(all_parsers, kwargs):
    """Two header rows directly followed by data, read with ``index_col=0``."""
    data = (
        ",a,a,a,b,c,c\n"
        ",q,r,s,t,u,v\n"
        "one,1,2,3,4,5,6\n"
        "two,7,8,9,10,11,12"
    )
    result = all_parsers.read_csv(StringIO(data), index_col=0, **kwargs)

    column_pairs = [
        ("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")
    ]
    expected = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(column_pairs),
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "kwargs",
    [
        {"header": [0, 1]},
        {
            "skiprows": 2,
            "names": [
                ("a", "q"),
                ("a", "r"),
                ("a", "s"),
                ("b", "t"),
                ("c", "u"),
                ("c", "v"),
            ],
        },
        {
            "skiprows": 2,
            "names": [
                _TestTuple("a", "q"),
                _TestTuple("a", "r"),
                _TestTuple("a", "s"),
                _TestTuple("b", "t"),
                _TestTuple("c", "u"),
                _TestTuple("c", "v"),
            ],
        },
    ],
)
def test_header_multi_index_common_format3(all_parsers, kwargs):
    """Two header rows and no index column, read with ``index_col=None``."""
    data = (
        "a,a,a,b,c,c\n"
        "q,r,s,t,u,v\n"
        "1,2,3,4,5,6\n"
        "7,8,9,10,11,12"
    )
    result = all_parsers.read_csv(StringIO(data), index_col=None, **kwargs)

    column_pairs = [
        ("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")
    ]
    labelled = DataFrame(
        [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
        index=["one", "two"],
        columns=MultiIndex.from_tuples(column_pairs),
    )
    # Drop the row labels: the parsed frame has no index column.
    expected = labelled.reset_index(drop=True)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_header_multi_index_common_format_malformed1(all_parsers):
    """``index_col=0`` on a fully labelled two-row header.

    The first column's labels ("a", "q") end up as the column level names.
    """
    data = (
        "a,a,a,b,c,c\n"
        "q,r,s,t,u,v\n"
        "1,2,3,4,5,6\n"
        "7,8,9,10,11,12"
    )
    result = all_parsers.read_csv(StringIO(data), header=[0, 1], index_col=0)

    values = np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64")
    expected_columns = MultiIndex(
        levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
        codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
        names=["a", "q"],
    )
    expected = DataFrame(values, index=Index([1, 7]), columns=expected_columns)
    tm.assert_frame_equal(expected, result)
@skip_pyarrow
def test_header_multi_index_common_format_malformed2(all_parsers):
    """``index_col=0`` when the first header cell is empty.

    The empty cell yields ``None`` as the first column level name.
    """
    data = (
        ",a,a,b,c,c\n"
        "q,r,s,t,u,v\n"
        "1,2,3,4,5,6\n"
        "7,8,9,10,11,12"
    )
    result = all_parsers.read_csv(StringIO(data), header=[0, 1], index_col=0)

    values = np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64")
    expected_columns = MultiIndex(
        levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
        codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
        names=[None, "q"],
    )
    expected = DataFrame(values, index=Index([1, 7]), columns=expected_columns)
    tm.assert_frame_equal(expected, result)
@skip_pyarrow
def test_header_multi_index_common_format_malformed3(all_parsers):
    """Same input as ``malformed2`` but with two index columns."""
    data = (
        ",a,a,b,c,c\n"
        "q,r,s,t,u,v\n"
        "1,2,3,4,5,6\n"
        "7,8,9,10,11,12"
    )
    result = all_parsers.read_csv(
        StringIO(data), header=[0, 1], index_col=[0, 1]
    )

    values = np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64")
    expected_index = MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]])
    expected_columns = MultiIndex(
        levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
        codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
        names=[None, "q"],
    )
    expected = DataFrame(values, index=expected_index, columns=expected_columns)
    tm.assert_frame_equal(expected, result)
@skip_pyarrow
def test_header_multi_index_blank_line(all_parsers):
    """An all-empty row after a two-row header is kept as a row of missing values."""
    # GH 40442
    csv_text = "a,b\nA,B\n,\n1,2\n3,4"
    result = all_parsers.read_csv(StringIO(csv_text), header=[0, 1])

    expected = DataFrame(
        [[None, None], [1, 2], [3, 4]],
        columns=MultiIndex.from_tuples([("a", "A"), ("b", "B")]),
    )
    tm.assert_frame_equal(expected, result)
@skip_pyarrow
@pytest.mark.parametrize(
    "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
)
def test_header_names_backward_compat(all_parsers, data, header):
    """Explicit ``names`` plus ``header`` matches ``names`` on headerless data."""
    # see gh-2539
    parser = all_parsers
    column_names = ["a", "b", "c"]

    result = parser.read_csv(StringIO(data), names=column_names, header=header)
    expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
def test_read_only_header_no_rows(all_parsers, kwargs):
    """A header line without data rows gives an empty frame with those columns."""
    # See gh-7773
    result = all_parsers.read_csv(StringIO("a,b,c"), **kwargs)
    tm.assert_frame_equal(result, DataFrame(columns=["a", "b", "c"]))
@pytest.mark.parametrize(
    "kwargs,names",
    [
        ({}, [0, 1, 2, 3, 4]),
        (
            {"names": ["foo", "bar", "baz", "quux", "panda"]},
            ["foo", "bar", "baz", "quux", "panda"],
        ),
    ],
)
def test_no_header(all_parsers, kwargs, names):
    """``header=None`` keeps every line as data; columns come from ``names``."""
    rows = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]
    data = "1,2,3,4,5\n6,7,8,9,10\n11,12,13,14,15\n"

    result = all_parsers.read_csv(StringIO(data), header=None, **kwargs)
    tm.assert_frame_equal(result, DataFrame(rows, columns=names))
@pytest.mark.parametrize("header", [["a", "b"], "string_header"])
def test_non_int_header(all_parsers, header):
    """String values in ``header`` raise ``ValueError``."""
    # see gh-16338
    data = "1,2\n3,4"
    expected_msg = "header must be integer or list of integers"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(StringIO(data), header=header)
@skip_pyarrow
def test_singleton_header(all_parsers):
    """``header=[0]`` yields ordinary, single-level column labels."""
    # see gh-7757
    data = "a,b,c\n0,1,2\n1,2,3"
    result = all_parsers.read_csv(StringIO(data), header=[0])
    expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "data,expected",
    [
        (
            "A,A,A,B\none,one,one,two\n0,40,34,0.1",
            DataFrame(
                [[0, 40, 34, 0.1]],
                columns=MultiIndex.from_tuples(
                    [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
                ),
            ),
        ),
        (
            "A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
            DataFrame(
                [[0, 40, 34, 0.1]],
                columns=MultiIndex.from_tuples(
                    [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
                ),
            ),
        ),
        (
            "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
            DataFrame(
                [[0, 40, 34, 0.1, 0.1]],
                columns=MultiIndex.from_tuples(
                    [
                        ("A", "one"),
                        ("A", "one.1"),
                        ("A", "one.1.1"),
                        ("B", "two"),
                        ("B", "two.1"),
                    ]
                ),
            ),
        ),
    ],
)
def test_mangles_multi_index(all_parsers, data, expected):
    """Duplicate two-level column labels get de-duplicating suffixes."""
    # see gh-18062
    buffer = StringIO(data)
    parsed = all_parsers.read_csv(buffer, header=[0, 1])
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
@pytest.mark.parametrize("index_col", [None, [0]])
@pytest.mark.parametrize(
    "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
)
def test_multi_index_unnamed(all_parsers, index_col, columns):
    """Two-row header whose first row has empty and/or "Unnamed" labels."""
    # see gh-23687
    #
    # When specifying a multi-index header, make sure that
    # we don't error just because one of the rows in our header
    # has ALL column names containing the string "Unnamed". The
    # correct condition to check is whether the row contains
    # ALL columns that did not have names (and instead were given
    # placeholder ones).
    first_row = columns or ["", ""]
    if index_col is None:
        position_shift = 0
        data = ",".join(first_row) + "\n0,1\n2,3\n4,5\n"
    else:
        # An extra leading field holds the index column, which shifts the
        # position used in the placeholder names by one.
        position_shift = 1
        data = ",".join([""] + first_row) + "\n,0,1\n0,2,3\n1,4,5\n"

    result = all_parsers.read_csv(
        StringIO(data), header=[0, 1], index_col=index_col
    )

    if columns is None:
        columns = ["", "", ""]
    # Empty labels are expected to be replaced by placeholder names.
    top_level = [
        col if col else f"Unnamed: {pos + position_shift}_level_0"
        for pos, col in enumerate(columns)
    ]
    expected_columns = MultiIndex.from_tuples(zip(top_level, ["0", "1"]))
    expected = DataFrame([[2, 3], [4, 5]], columns=expected_columns)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
    """Three ``names`` replace a two-field header over three-field data rows."""
    # GH#38453
    data = "a, b\n1,2,3\n5,6,4\n"
    result = all_parsers.read_csv(
        StringIO(data), header=0, names=["A", "B", "C"]
    )
    expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_read_csv_multiindex_columns(all_parsers):
    """Two-row header with space-padded labels, for one and two data rows."""
    # GH#6051
    parser = all_parsers
    header_text = "Male, Male, Male, Female, Female\nR, R, L, R, R\n"
    first_row = ".86, .67, .88, .78, .81"
    one_row_csv = header_text + first_row
    two_row_csv = header_text + first_row + "\n" + ".86, .67, .88, .78, .82"

    # The blank after each comma stays part of the label, so only the first
    # column's labels are unpadded.
    column_pairs = [
        ("Male", "R"),
        (" Male", " R"),
        (" Male", " L"),
        (" Female", " R"),
        (" Female", " R.1"),
    ]
    expected = DataFrame(
        [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]],
        columns=MultiIndex.from_tuples(column_pairs),
    )

    one_row = parser.read_csv(StringIO(one_row_csv), header=[0, 1])
    tm.assert_frame_equal(one_row, expected.iloc[:1])
    two_rows = parser.read_csv(StringIO(two_row_csv), header=[0, 1])
    tm.assert_frame_equal(two_rows, expected)
@skip_pyarrow
def test_read_csv_multi_header_length_check(all_parsers):
    """Header rows with different field counts raise ``ParserError``."""
    # GH#43102
    case = "row11,row12,row13\nrow21,row22, row23\nrow31,row32\n"
    expected_msg = "Header rows must have an equal number of columns."
    with pytest.raises(ParserError, match=expected_msg):
        all_parsers.read_csv(StringIO(case), header=[0, 2])
@skip_pyarrow
def test_header_none_and_implicit_index(all_parsers):
    """A three-field first row with two ``names`` gives an implicit index."""
    # GH#22144
    data = "x,1,5\ny,2\nz,3\n"
    result = all_parsers.read_csv(
        StringIO(data), names=["a", "b"], header=None
    )
    # The shorter rows are padded with NaN in column "b".
    expected = DataFrame(
        {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_header_none_and_implicit_index_in_second_row(all_parsers):
    """An extra field appearing only in the second row raises ``ParserError``."""
    # GH#22144
    data = "x,1\ny,2,5\nz,3\n"
    expected_msg = "Expected 2 fields in line 2, saw 3"
    with pytest.raises(ParserError, match=expected_msg):
        all_parsers.read_csv(StringIO(data), names=["a", "b"], header=None)
@skip_pyarrow
def test_header_none_and_on_bad_lines_skip(all_parsers):
    """``on_bad_lines="skip"`` drops the row that has one field too many."""
    # GH#22144
    data = "x,1\ny,2,5\nz,3\n"
    read_options = {"names": ["a", "b"], "header": None, "on_bad_lines": "skip"}
    result = all_parsers.read_csv(StringIO(data), **read_options)
    tm.assert_frame_equal(result, DataFrame({"a": ["x", "z"], "b": [1, 3]}))
@skip_pyarrow
def test_header_missing_rows(all_parsers):
    """Requesting three header rows from a two-line input raises ``ValueError``."""
    # GH#47400
    data = "a,b\n1,2\n"
    expected_msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(StringIO(data), header=[0, 1, 2])
@skip_pyarrow
def test_header_multiple_whitespaces(all_parsers):
    """Parse a header and a row whose fields are separated by runs of blanks.

    The header labels contain commas and parentheses, so the fields must be
    split on whitespace only (``sep=r"\\s+"``).
    """
    # GH#54931
    parser = all_parsers
    # Use more than one blank between fields; with single blanks the
    # "multiple whitespaces" case named by this test is never exercised.
    data = "aa    bb(1,1)   cc(1,1)\n0  2  3.5"

    result = parser.read_csv(StringIO(data), sep=r"\s+")
    expected = DataFrame({"aa": [0], "bb(1,1)": 2, "cc(1,1)": 3.5})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_header_delim_whitespace(all_parsers):
    """With ``delim_whitespace=True`` commas do not split the fields."""
    # GH#54918
    data = "a,b\n1,2\n3,4\n"
    result = all_parsers.read_csv(StringIO(data), delim_whitespace=True)
    # Each line holds no whitespace, so it is a single field.
    tm.assert_frame_equal(result, DataFrame({"a,b": ["1,2", "3,4"]}))

View File

@@ -0,0 +1,369 @@
"""
Tests that the specified index column (a.k.a "index_col")
is properly handled or inferred during parsing for all of
the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
# TODO(1.4): Change me to xfails at release time
# Decorator that makes a test request the "pyarrow_skip" fixture.
# NOTE(review): the fixture is defined outside this module; presumably it
# skips the test for the pyarrow engine -- confirm in conftest.
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("with_header", [True, False])
def test_index_col_named(all_parsers, with_header):
    """``index_col="ID"`` works with a header line and raises without one."""
    parser = all_parsers
    body = (
        "KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n"
        "KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n"
        "KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n"
        "KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n"
        "KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n"
        "KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"
    )

    if not with_header:
        # Without a header line there is no column called "ID".
        with pytest.raises(ValueError, match="Index ID invalid"):
            parser.read_csv(StringIO(body), index_col="ID")
        return

    header_line = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n"
    data = header_line + body
    result = parser.read_csv(StringIO(data), index_col="ID")
    expected = parser.read_csv(StringIO(data), header=0).set_index("ID")
    tm.assert_frame_equal(result, expected)
def test_index_col_named2(all_parsers):
    """A name from ``names`` can be used in ``index_col`` (last column here)."""
    column_names = ["a", "b", "c", "d", "message"]
    data = "1,2,3,4,hello\n5,6,7,8,world\n9,10,11,12,foo\n"

    result = all_parsers.read_csv(
        StringIO(data), names=column_names, index_col=["message"]
    )

    expected = DataFrame(
        {"a": [1, 5, 9], "b": [2, 6, 10], "c": [3, 7, 11], "d": [4, 8, 12]},
        index=Index(["hello", "world", "foo"], name="message"),
    )
    tm.assert_frame_equal(result, expected)
def test_index_col_is_true(all_parsers):
    """``index_col=True`` raises ``ValueError``."""
    # see gh-9798
    buffer = StringIO("a,b\n1,2")
    expected_msg = "The value of index_col couldn't be 'True'"
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(buffer, index_col=True)
@skip_pyarrow
def test_infer_index_col(all_parsers):
    """A header one field shorter than the rows makes the first field the index."""
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6\nbaz,7,8,9\n"
    result = all_parsers.read_csv(StringIO(data))

    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        index=["foo", "bar", "baz"],
        columns=["A", "B", "C"],
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "index_col,kwargs",
    [
        (None, {"columns": ["x", "y", "z"]}),
        (False, {"columns": ["x", "y", "z"]}),
        (0, {"columns": ["y", "z"], "index": Index([], name="x")}),
        (1, {"columns": ["x", "z"], "index": Index([], name="y")}),
        ("x", {"columns": ["y", "z"], "index": Index([], name="x")}),
        ("y", {"columns": ["x", "z"], "index": Index([], name="y")}),
        (
            [0, 1],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
            },
        ),
        (
            ["x", "y"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["x", "y"]),
            },
        ),
        (
            [1, 0],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
            },
        ),
        (
            ["y", "x"],
            {
                "columns": ["z"],
                "index": MultiIndex.from_arrays([[]] * 2, names=["y", "x"]),
            },
        ),
    ],
)
def test_index_col_empty_data(all_parsers, index_col, kwargs):
    """Header-only input: ``index_col`` still shapes the empty result."""
    header_only = StringIO("x,y,z")
    parsed = all_parsers.read_csv(header_only, index_col=index_col)
    # ``kwargs`` holds the DataFrame constructor arguments of the expectation.
    tm.assert_frame_equal(parsed, DataFrame(**kwargs))
@skip_pyarrow
def test_empty_with_index_col_false(all_parsers):
    """Header-only input with ``index_col=False`` keeps both columns."""
    # see gh-10413
    parsed = all_parsers.read_csv(StringIO("x,y"), index_col=False)
    tm.assert_frame_equal(parsed, DataFrame(columns=["x", "y"]))
@skip_pyarrow
@pytest.mark.parametrize(
    "index_names",
    [
        ["", ""],
        ["foo", ""],
        ["", "bar"],
        ["foo", "bar"],
        ["NotReallyUnnamed", "Unnamed: 0"],
    ],
)
def test_multi_index_naming(all_parsers, index_names):
    """Index level names come from the header; empty ones become ``None``."""
    # We don't want empty index names being replaced with "Unnamed: 0"
    data = ",".join([*index_names, "col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
    result = all_parsers.read_csv(StringIO(data), index_col=[0, 1])

    expected_index = MultiIndex.from_product([["a", "b"], ["c", "d"]])
    expected = DataFrame({"col": [1, 2, 3, 4]}, index=expected_index)
    expected.index.names = [name or None for name in index_names]
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_multi_index_naming_not_all_at_beginning(all_parsers):
    """Non-adjacent index columns (0 and 2) with empty header labels."""
    data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    result = all_parsers.read_csv(StringIO(data), index_col=[0, 2])

    expected_index = MultiIndex(
        levels=[["a", "b"], [1, 2, 3, 4]], codes=[[0, 0, 1, 1], [0, 1, 2, 3]]
    )
    expected = DataFrame(
        {"Unnamed: 2": ["c", "d", "c", "d"]}, index=expected_index
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_no_multi_index_level_names_empty(all_parsers):
    """Round-trip a frame with an unnamed three-level index through CSV."""
    # GH 10984
    row_labels = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)])
    values = np.random.default_rng(2).standard_normal((3, 3))
    expected = DataFrame(values, index=row_labels, columns=["x", "y", "z"])

    with tm.ensure_clean() as path:
        expected.to_csv(path)
        result = all_parsers.read_csv(path, index_col=[0, 1, 2])
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_header_with_index_col(all_parsers):
    """Combine ``index_col`` with a two-row and with a single-row header."""
    # GH 33476
    parser = all_parsers
    # The text starts with an empty line, exactly as in the original input.
    data = "\nI11,A,A\nI12,B,B\nI2,1,3\n"

    # Two header rows: the index column's labels name the column levels.
    two_level_columns = MultiIndex.from_tuples(
        [("A", "B"), ("A", "B.1")], names=["I11", "I12"]
    )
    expected = DataFrame([[1, 3]], index=Index(["I2"]), columns=two_level_columns)
    result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1])
    tm.assert_frame_equal(result, expected)

    # One header row: the second line is data, so the values stay strings.
    expected = DataFrame(
        [["B", "B"], ["1", "3"]],
        index=Index(["I12", "I2"], name="I11"),
        columns=Index(["A", "A.1"]),
    )
    result = parser.read_csv(StringIO(data), index_col="I11", header=0)
    tm.assert_frame_equal(result, expected)
@pytest.mark.slow
def test_index_col_large_csv(all_parsers, monkeypatch):
    """Read with ``index_col=[0]`` while ``_MINIMUM_COMP_ARR_LEN`` is patched.

    The frame has one row more than the patched threshold.
    """
    # https://github.com/pandas-dev/pandas/issues/37094
    ARR_LEN = 100
    row_count = ARR_LEN + 1
    df = DataFrame(
        {
            "a": range(row_count),
            "b": np.random.default_rng(2).standard_normal(row_count),
        }
    )

    with tm.ensure_clean() as path:
        df.to_csv(path, index=False)
        with monkeypatch.context() as patcher:
            patcher.setattr(
                "pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN
            )
            result = all_parsers.read_csv(path, index_col=[0])

    tm.assert_frame_equal(result, df.set_index("a"))
@skip_pyarrow
def test_index_col_multiindex_columns_no_data(all_parsers):
    """Two header rows, ``index_col=0`` and no data rows."""
    # GH#38292
    buffer = StringIO("a0,a1,a2\nb0,b1,b2\n")
    result = all_parsers.read_csv(buffer, header=[0, 1], index_col=0)

    expected_columns = MultiIndex.from_arrays(
        [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
    )
    expected = DataFrame([], index=Index([]), columns=expected_columns)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_index_col_header_no_data(all_parsers):
    """One header row, ``index_col=0`` and no data rows."""
    # GH#38292
    buffer = StringIO("a0,a1,a2\n")
    result = all_parsers.read_csv(buffer, header=[0], index_col=0)

    expected = DataFrame(
        [],
        columns=["a1", "a2"],
        index=Index([], name="a0"),
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_multiindex_columns_no_data(all_parsers):
    """Two header rows, no index column and no data rows."""
    # GH#38292
    buffer = StringIO("a0,a1,a2\nb0,b1,b2\n")
    result = all_parsers.read_csv(buffer, header=[0, 1])

    expected_columns = MultiIndex.from_arrays(
        [["a0", "a1", "a2"], ["b0", "b1", "b2"]]
    )
    tm.assert_frame_equal(result, DataFrame([], columns=expected_columns))
@skip_pyarrow
def test_multiindex_columns_index_col_with_data(all_parsers):
    """Two header rows, ``index_col=0`` and a single data row."""
    # GH#38292
    buffer = StringIO("a0,a1,a2\nb0,b1,b2\ndata,data,data")
    result = all_parsers.read_csv(buffer, header=[0, 1], index_col=0)

    expected_columns = MultiIndex.from_arrays(
        [["a1", "a2"], ["b1", "b2"]], names=["a0", "b0"]
    )
    expected = DataFrame(
        [["data", "data"]], columns=expected_columns, index=Index(["data"])
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_infer_types_boolean_sum(all_parsers):
    """Read an index column for which a ``UInt8`` dtype was requested."""
    # GH#44079
    read_options = {
        "names": ["a", "b"],
        "index_col": ["a"],
        "dtype": {"a": "UInt8"},
    }
    result = all_parsers.read_csv(StringIO("0,1"), **read_options)

    expected = DataFrame(data={"a": [0], "b": [1]}).set_index("a")
    # Not checking index type now, because the C parser will return a
    # index column of dtype 'object', and the Python parser will return a
    # index column of dtype 'int64'.
    tm.assert_frame_equal(result, expected, check_index_type=False)
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
    """A ``dtype`` given for the index column decides the index values."""
    # GH#9435
    parser = all_parsers
    pyarrow_with_object = dtype == object and parser.engine == "pyarrow"
    if pyarrow_with_object:
        xfail_mark = pytest.mark.xfail(
            reason="Cannot disable type-inference for pyarrow engine"
        )
        request.node.add_marker(xfail_mark)

    data = "a,b\n01,2"
    result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
    expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_multiindex_columns_not_leading_index_col(all_parsers):
    """Two header rows with the index taken from the second column."""
    # GH#38549
    data = "a,b,c,d\ne,f,g,h\nx,y,1,2\n"
    result = all_parsers.read_csv(StringIO(data), header=[0, 1], index_col=1)

    # The index column's header labels ("b", "f") name the column levels.
    expected_columns = MultiIndex.from_tuples(
        [("a", "e"), ("c", "g"), ("d", "h")], names=["b", "f"]
    )
    expected = DataFrame([["x", 1, 2]], columns=expected_columns, index=["y"])
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,176 @@
"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
# Decorator that makes a test request the "pyarrow_skip" fixture.
# NOTE(review): the fixture is defined outside this module; presumably it
# skips the test for the pyarrow engine -- confirm in conftest.
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
@skip_pyarrow
def test_basic(all_parsers):
    """Repeated header labels receive ``.1``, ``.2`` suffixes."""
    buffer = StringIO("a,a,b,b,b\n1,2,3,4,5")
    result = all_parsers.read_csv(buffer, sep=",")

    expected_labels = ["a", "a.1", "b", "b.1", "b.2"]
    expected = DataFrame([[1, 2, 3, 4, 5]], columns=expected_labels)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_basic_names(all_parsers):
    """A label repeated in the header line is suffixed on second use."""
    # See gh-7160
    result = all_parsers.read_csv(StringIO("a,b,a\n0,1,2\n3,4,5"))
    expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])
    tm.assert_frame_equal(result, expected)
def test_basic_names_raise(all_parsers):
    """Duplicates in a user-supplied ``names`` list raise ``ValueError``."""
    # See gh-7160
    duplicated_names = ["a", "b", "a"]
    buffer = StringIO("0,1,2\n3,4,5")
    with pytest.raises(ValueError, match="Duplicate names"):
        all_parsers.read_csv(buffer, names=duplicated_names)
@skip_pyarrow
@pytest.mark.parametrize(
    "data,expected",
    [
        ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
        (
            "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
            DataFrame(
                [[1, 2, 3, 4, 5, 6]],
                columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
            ),
        ),
        (
            "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
            DataFrame(
                [[1, 2, 3, 4, 5, 6, 7]],
                columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
            ),
        ),
    ],
)
def test_thorough_mangle_columns(all_parsers, data, expected):
    """Suffixes for duplicates avoid labels that already look suffixed."""
    # see gh-17060
    buffer = StringIO(data)
    parsed = all_parsers.read_csv(buffer)
    tm.assert_frame_equal(parsed, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "data,names,expected",
    [
        (
            "a,b,b\n1,2,3",
            ["a.1", "a.1", "a.1.1"],
            DataFrame(
                [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
            ),
        ),
        (
            "a,b,c,d,e,f\n1,2,3,4,5,6",
            ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
            DataFrame(
                [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
                columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
            ),
        ),
        (
            "a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
            ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
            DataFrame(
                [
                    ["a", "b", "c", "d", "e", "f", "g"],
                    ["1", "2", "3", "4", "5", "6", "7"],
                ],
                columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
            ),
        ),
    ],
)
def test_thorough_mangle_names(all_parsers, data, names, expected):
    """Duplicate entries in ``names`` raise ``ValueError``.

    ``expected`` is part of the parametrization but is not compared against
    anything here; only the error is checked.
    """
    # see gh-17095
    buffer = StringIO(data)
    with pytest.raises(ValueError, match="Duplicate names"):
        all_parsers.read_csv(buffer, names=names)
@skip_pyarrow
def test_mangled_unnamed_placeholders(all_parsers):
    """Repeated ``to_csv``/``read_csv`` round trips stack "Unnamed: 0" columns.

    Every round writes the current index as a column with an empty label,
    so each read adds one more placeholder column at the front.
    """
    # xref gh-13017
    orig_key = "0"
    orig_value = [1, 2, 3]
    df = DataFrame({orig_key: orig_value})

    # This test recursively updates `df`.
    for round_no in range(3):
        expected = DataFrame()
        for depth in range(round_no + 1):
            # The first placeholder is unsuffixed; later ones get ".<depth>".
            col_name = "Unnamed: 0" if depth == 0 else f"Unnamed: 0.{depth}"
            expected.insert(loc=0, column=col_name, value=[0, 1, 2])
        expected[orig_key] = orig_value

        df = all_parsers.read_csv(StringIO(df.to_csv()))
        tm.assert_frame_equal(df, expected)
@skip_pyarrow
def test_mangle_dupe_cols_already_exists(all_parsers):
    """Generated suffixes skip labels that already occur in the header."""
    # GH#14704
    buffer = StringIO("a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7")
    result = all_parsers.read_csv(buffer)

    expected_labels = ["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"]
    expected = DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=expected_labels)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
    """Placeholders for empty labels avoid existing "Unnamed: N" labels."""
    # GH#14704
    buffer = StringIO(",Unnamed: 0,,Unnamed: 2\n1,2,3,4")
    result = all_parsers.read_csv(buffer)

    expected_labels = ["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"]
    expected = DataFrame([[1, 2, 3, 4]], columns=expected_labels)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("usecol, engine", [([0, 1, 1], "python"), ([0, 1, 1], "c")])
def test_mangle_cols_names(all_parsers, usecol, engine):
    """Duplicate ``names`` raise ``ValueError`` even when ``usecols`` is given."""
    # GH 11823
    duplicated_names = ["A", "A", "B"]
    buffer = StringIO("1,2,3")
    with pytest.raises(ValueError, match="Duplicate names"):
        all_parsers.read_csv(
            buffer, names=duplicated_names, usecols=usecol, engine=engine
        )

View File

@@ -0,0 +1,147 @@
"""
Tests multithreading behaviour for reading and
parsing files for each parser defined in parsers.py
"""
from contextlib import ExitStack
from io import BytesIO
from multiprocessing.pool import ThreadPool
import numpy as np
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
# We'll probably always skip these for pyarrow
# Maybe we'll add our own tests for pyarrow too
# Module-level marks: pytest applies every entry to each test in this module.
pytestmark = [
pytest.mark.single_cpu,
pytest.mark.slow,
pytest.mark.usefixtures("pyarrow_skip"),
]
def test_multi_thread_string_io_read_csv(all_parsers):
    """Parse identical in-memory CSVs from a thread pool and compare results."""
    # see gh-11786
    max_row_range = 100
    num_files = 10
    payload = "\n".join(
        f"{i:d},{i:d},{i:d}" for i in range(max_row_range)
    ).encode()

    # Read all files in many threads.
    with ExitStack() as stack:
        # One independent buffer per "file", all with the same bytes.
        files = [
            stack.enter_context(BytesIO(payload)) for _ in range(num_files)
        ]
        pool = stack.enter_context(ThreadPool(8))
        results = pool.map(all_parsers.read_csv, files)

        reference = results[0]
        for frame in results:
            tm.assert_frame_equal(reference, frame)
def _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks):
    """
    Read a CSV file in row slices from several threads and concatenate them.

    Parameters
    ----------
    parser : BaseParser
        The parser object to use for reading the data.
    path : str
        The location of the CSV file to read.
    num_rows : int
        The total number of rows, split evenly across the tasks.
    num_tasks : int
        The number of tasks (and pool threads) used for reading.

    Returns
    -------
    df : DataFrame
    """
    rows_per_task = num_rows // num_tasks

    def read_slice(bounds):
        """
        Read one slice of the CSV.

        Parameters
        ----------
        bounds : tuple
            A tuple of the following:

            * start : int
                The first data row of the slice
            * nrows : int
                The number of rows to read.

        Returns
        -------
        df : DataFrame
        """
        first_row, row_count = bounds
        if first_row:
            # Later slices skip the header line plus the preceding rows, so
            # the date column has to be addressed by position.
            return parser.read_csv(
                path,
                index_col=0,
                header=None,
                skiprows=int(first_row) + 1,
                nrows=row_count,
                parse_dates=[9],
            )
        # The first slice still sees the header line.
        return parser.read_csv(
            path, index_col=0, header=0, nrows=row_count, parse_dates=["date"]
        )

    tasks = [(num_rows * i // num_tasks, rows_per_task) for i in range(num_tasks)]

    with ThreadPool(processes=num_tasks) as pool:
        results = pool.map(read_slice, tasks)

    # Only the first slice carries real column labels; copy them to the rest.
    header = results[0].columns
    for part in results[1:]:
        part.columns = header

    return pd.concat(results)
def test_multi_thread_path_multipart_read_csv(all_parsers):
    """Write a frame to CSV and read it back in parallel chunks (gh-11786)."""
    num_tasks = 4
    num_rows = 48

    # Each float column is drawn from a freshly seeded generator.
    columns = {
        name: np.random.default_rng(2).random(num_rows)
        for name in ("a", "b", "c", "d", "e")
    }
    columns["foo"] = ["foo"] * num_rows
    columns["bar"] = ["bar"] * num_rows
    columns["baz"] = ["baz"] * num_rows
    columns["date"] = pd.date_range("20000101 09:00:00", periods=num_rows, freq="s")
    columns["int"] = np.arange(num_rows, dtype="int64")
    df = DataFrame(columns)

    with tm.ensure_clean("__thread_pool_reader__.csv") as path:
        df.to_csv(path)

        round_tripped = _generate_multi_thread_dataframe(
            all_parsers, path, num_rows, num_tasks
        )
        tm.assert_frame_equal(df, round_tripped)

View File

@@ -0,0 +1,671 @@
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas._libs.parsers import STR_NA_VALUES
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
# Reusable marks that request the "pyarrow_skip" / "pyarrow_xfail" fixtures
# for a test. The fixtures are defined outside this file; presumably they
# skip / xfail the test when the pyarrow engine is used (confirm in conftest).
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
def test_string_nas(all_parsers):
    """Empty fields in string columns are read as missing values."""
    csv_text = """A,B,C
a,b,c
d,,f
,g,h
"""
    expected = DataFrame(
        [["a", "b", "c"], ["d", np.nan, "f"], [np.nan, "g", "h"]],
        columns=["A", "B", "C"],
    )
    if all_parsers.engine == "pyarrow":
        # For the pyarrow engine the expected missing marker is None.
        expected.loc[2, "A"] = None
        expected.loc[1, "B"] = None

    result = all_parsers.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(result, expected)
def test_detect_string_na(all_parsers):
    """The tokens NA, NaN and nan are read as missing values."""
    csv_text = """A,B
foo,bar
NA,baz
NaN,nan
"""
    rows = [["foo", "bar"], [np.nan, "baz"], [np.nan, np.nan]]
    expected = DataFrame(rows, columns=["A", "B"])
    if all_parsers.engine == "pyarrow":
        # For the pyarrow engine the expected missing marker is None.
        expected.loc[[1, 2], "A"] = None
        expected.loc[2, "B"] = None

    result = all_parsers.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "na_values",
    [
        ["-999.0", "-999"],
        [-999, -999.0],
        [-999.0, -999],
        ["-999.0"],
        ["-999"],
        [-999.0],
        [-999],
    ],
)
@pytest.mark.parametrize(
    "data",
    [
        """A,B
-999,1.2
2,-999
3,4.5
""",
        """A,B
-999,1.200
2,-999.000
3,4.500
""",
    ],
)
def test_non_string_na_values(all_parsers, data, na_values):
    """
    Numeric and string spellings of -999 all mark the value as NA (gh-3611).

    With an odd float format the string "999.0" cannot be matched exactly,
    so float matching is still required.
    """
    result = all_parsers.read_csv(StringIO(data), na_values=na_values)

    rows = [[np.nan, 1.2], [2.0, np.nan], [3.0, 4.5]]
    expected = DataFrame(rows, columns=["A", "B"])
    tm.assert_frame_equal(result, expected)
def test_default_na_values(all_parsers):
    """Every default NA token is read as NaN, whichever column it sits in."""
    default_tokens = {
        "-1.#IND",
        "1.#QNAN",
        "1.#IND",
        "-1.#QNAN",
        "#N/A",
        "N/A",
        "n/a",
        "NA",
        "<NA>",
        "#NA",
        "NULL",
        "null",
        "NaN",
        "nan",
        "-NaN",
        "-nan",
        "#N/A N/A",
        "",
        "None",
    }
    # Guard against the parser's own default set drifting from this list.
    assert default_tokens == STR_NA_VALUES

    width = len(default_tokens)

    def make_row(position, token):
        # Put ``token`` in field ``position`` of a ``width``-field row and
        # leave every other field empty.
        leading = "," * position
        trailing = "," * (width - position - 1)
        return f"{leading}{token}{trailing}"

    lines = [make_row(pos, token) for pos, token in enumerate(default_tokens)]
    buffer = StringIO("\n".join(lines))

    expected = DataFrame(np.nan, columns=range(width), index=range(width))
    result = all_parsers.read_csv(buffer, header=None)
    tm.assert_frame_equal(result, expected)
# TODO: needs skiprows list support in pyarrow
@skip_pyarrow
@pytest.mark.parametrize("na_values", ["baz", ["baz"]])
def test_custom_na_values(all_parsers, na_values):
    """A custom NA token is honoured alongside the defaults."""
    csv_text = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
    result = all_parsers.read_csv(
        StringIO(csv_text), na_values=na_values, skiprows=[1]
    )

    rows = [[1.0, np.nan, 3], [np.nan, 5, np.nan], [7, 8, np.nan]]
    expected = DataFrame(rows, columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
def test_bool_na_values(all_parsers):
    """Boolean columns containing NA come back as object columns."""
    csv_text = """A,B,C
True,False,True
NA,True,False
False,NA,True"""

    col_a = np.array([True, np.nan, False], dtype=object)
    col_b = np.array([False, True, np.nan], dtype=object)
    expected = DataFrame({"A": col_a, "B": col_b, "C": [True, False, True]})
    if all_parsers.engine == "pyarrow":
        # For the pyarrow engine the expected missing marker is None.
        expected.loc[1, "A"] = None
        expected.loc[2, "B"] = None

    result = all_parsers.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(result, expected)
# TODO: Needs pyarrow support for dictionary in na_values
@skip_pyarrow
def test_na_value_dict(all_parsers):
    """Per-column NA tokens given as a dict apply to their own column only."""
    csv_text = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
    per_column_na = {"A": ["foo"], "B": ["bar"]}

    result = all_parsers.read_csv(StringIO(csv_text), na_values=per_column_na)
    expected = DataFrame(
        {
            "A": [np.nan, "bar", np.nan, "bar"],
            "B": [np.nan, "foo", np.nan, "foo"],
            "C": [np.nan, "foo", np.nan, "foo"],
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "index_col,expected",
    [
        (
            [0],
            DataFrame({"b": [np.nan], "c": [1], "d": [5]}, index=Index([0], name="a")),
        ),
        (
            [0, 2],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
        (
            ["a", "c"],
            DataFrame(
                {"b": [np.nan], "d": [5]},
                index=MultiIndex.from_tuples([(0, 1)], names=["a", "c"]),
            ),
        ),
    ],
)
def test_na_value_dict_multi_index(all_parsers, index_col, expected):
    """An empty na_values set combined with single and multi-level index_col."""
    csv_text = """\
a,b,c,d
0,NA,1,5
"""
    buffer = StringIO(csv_text)
    result = all_parsers.read_csv(buffer, na_values=set(), index_col=index_col)
    tm.assert_frame_equal(result, expected)
# TODO: xfail components of this test, the first one passes
@skip_pyarrow
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        (
            {},
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
        (
            {"na_values": {"A": [], "C": []}, "keep_default_na": False},
            DataFrame(
                {
                    "A": ["a", "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            {"na_values": ["a"], "keep_default_na": False},
            DataFrame(
                {
                    "A": [np.nan, "b", "", "d", "e", "nan", "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", "nan", "five", "", "seven"],
                }
            ),
        ),
        (
            {"na_values": {"A": [], "C": []}},
            DataFrame(
                {
                    "A": ["a", "b", np.nan, "d", "e", np.nan, "g"],
                    "B": [1, 2, 3, 4, 5, 6, 7],
                    "C": ["one", "two", "three", np.nan, "five", np.nan, "seven"],
                }
            ),
        ),
    ],
)
def test_na_values_keep_default(all_parsers, kwargs, expected):
    """Interaction of na_values with keep_default_na on the same input."""
    csv_text = """\
A,B,C
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    buffer = StringIO(csv_text)
    result = all_parsers.read_csv(buffer, **kwargs)
    tm.assert_frame_equal(result, expected)
def test_no_na_values_no_keep_default(all_parsers):
    """
    With keep_default_na=False and no na_values nothing is treated as NA.

    See gh-4318: passing na_values=None together with keep_default_na=False
    used to yield "None" as an NA value.
    """
    csv_text = """\
A,B,C
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    expected = DataFrame(
        {
            "A": ["a", "b", "", "d", "e", "nan", "g"],
            "B": [1, 2, 3, 4, 5, 6, 7],
            "C": ["None", "two", "None", "nan", "five", "", "seven"],
        }
    )
    result = all_parsers.read_csv(StringIO(csv_text), keep_default_na=False)
    tm.assert_frame_equal(result, expected)
# TODO: Blocked on na_values dict support in pyarrow
@skip_pyarrow
def test_no_keep_default_na_dict_na_values(all_parsers):
    """Dict na_values with keep_default_na=False leaves other columns alone (gh-19227)."""
    buffer = StringIO("a,b\n,2")
    expected = DataFrame({"a": [""], "b": [np.nan]})

    result = all_parsers.read_csv(
        buffer, na_values={"b": ["2"]}, keep_default_na=False
    )
    tm.assert_frame_equal(result, expected)
# TODO: Blocked on na_values dict support in pyarrow
@skip_pyarrow
def test_no_keep_default_na_dict_na_scalar_values(all_parsers):
    """
    Scalar values inside a na_values dict are accepted (gh-19227).

    Scalar values shouldn't cause the parsing to crash or fail.
    """
    buffer = StringIO("a,b\n1,2")
    expected = DataFrame({"a": [1], "b": [np.nan]})

    result = all_parsers.read_csv(buffer, na_values={"b": 2}, keep_default_na=False)
    tm.assert_frame_equal(result, expected)
# TODO: Blocked on na_values dict support in pyarrow
@skip_pyarrow
@pytest.mark.parametrize("col_zero_na_values", [113125, "113125"])
def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
    """The int and str spellings of a per-column NA value behave alike (gh-19227)."""
    csv_text = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
    per_column_na = {2: "", 6: "214.008", 1: "blah", 0: col_zero_na_values}

    result = all_parsers.read_csv(
        StringIO(csv_text),
        header=None,
        keep_default_na=False,
        na_values=per_column_na,
    )
    expected = DataFrame(
        {
            0: [np.nan, 729639.0],
            1: [np.nan, "qwer"],
            2: ["/blaha", np.nan],
            3: ["kjsdkj", "asdfkj"],
            4: [412.166, 466.681],
            5: ["225.874", ""],
            6: [np.nan, 252.373],
        }
    )
    tm.assert_frame_equal(result, expected)
# TODO: Empty null_values doesn't work properly on pyarrow
@skip_pyarrow
@pytest.mark.parametrize(
    "na_filter,row_data",
    [
        (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
        (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
    ],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
    """na_filter=False turns off NA detection even when na_values is given."""
    csv_text = """\
A,B
1,A
nan,B
3,C
"""
    expected = DataFrame(row_data, columns=["A", "B"])
    result = all_parsers.read_csv(
        StringIO(csv_text), na_values=["B"], na_filter=na_filter
    )
    tm.assert_frame_equal(result, expected)
# TODO: Arrow parse error
@skip_pyarrow
def test_na_trailing_columns(all_parsers):
    """Rows shorter than the header are padded with NaN on the right."""
    csv_text = """Date,Currency,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""
    header = [
        "Date",
        "Currency",
        "Symbol",
        "Type",
        "Units",
        "UnitPrice",
        "Cost",
        "Tax",
    ]
    # Trailing columns should be all NaN.
    padding = [np.nan, np.nan, np.nan]
    expected = DataFrame(
        [
            ["2012-03-14", "USD", "AAPL", "BUY", 1000] + padding,
            ["2012-05-12", "USD", "SBUX", "SELL", 500] + padding,
        ],
        columns=header,
    )

    result = all_parsers.read_csv(StringIO(csv_text))
    tm.assert_frame_equal(result, expected)
# TODO: xfail the na_values dict case
@skip_pyarrow
@pytest.mark.parametrize(
    "na_values,row_data",
    [
        (1, [[np.nan, 2.0], [2.0, np.nan]]),
        ({"a": 2, "b": 1}, [[1.0, 2.0], [np.nan, np.nan]]),
    ],
)
def test_na_values_scalar(all_parsers, na_values, row_data):
    """Scalar na_values work both bare and as dict values (gh-12224)."""
    column_names = ["a", "b"]
    expected = DataFrame(row_data, columns=column_names)

    result = all_parsers.read_csv(
        StringIO("1,2\n2,1"), names=column_names, na_values=na_values
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_na_values_dict_aliasing(all_parsers):
    """read_csv must not mutate the na_values dict passed by the caller."""
    na_values = {"a": 2, "b": 1}
    snapshot = na_values.copy()
    column_names = ["a", "b"]

    result = all_parsers.read_csv(
        StringIO("1,2\n2,1"), names=column_names, na_values=na_values
    )
    expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=column_names)

    tm.assert_frame_equal(result, expected)
    # The caller's dict is unchanged after parsing.
    tm.assert_dict_equal(na_values, snapshot)
@skip_pyarrow
def test_na_values_dict_col_index(all_parsers):
    """na_values dict keys may be column positions (gh-14203)."""
    buffer = StringIO("a\nfoo\n1")
    expected = DataFrame({"a": [np.nan, 1]})

    result = all_parsers.read_csv(buffer, na_values={0: "foo"})
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            str(2**63) + "\n" + str(2**63 + 1),
            {"na_values": [2**63]},
            DataFrame([str(2**63), str(2**63 + 1)]),
        ),
        (str(2**63) + ",1" + "\n,2", {}, DataFrame([[str(2**63), 1], ["", 2]])),
        (str(2**63) + "\n1", {"na_values": [2**63]}, DataFrame([np.nan, 1])),
    ],
)
def test_na_values_uint64(all_parsers, data, kwargs, expected):
    """NA handling for values at and beyond 2**63 (gh-14983)."""
    buffer = StringIO(data)
    result = all_parsers.read_csv(buffer, header=None, **kwargs)
    tm.assert_frame_equal(result, expected)
def test_empty_na_values_no_default_with_index(all_parsers):
    """keep_default_na=False works together with index_col (gh-15835)."""
    buffer = StringIO("a,1\nb,2")
    result = all_parsers.read_csv(buffer, index_col=0, keep_default_na=False)

    expected = DataFrame({"1": [2]}, index=Index(["b"], name="a"))
    tm.assert_frame_equal(result, expected)
# TODO: Missing support for na_filter keyword
@skip_pyarrow
@pytest.mark.parametrize(
    "na_filter,index_data", [(False, ["", "5"]), (True, [np.nan, 5.0])]
)
def test_no_na_filter_on_index(all_parsers, na_filter, index_data):
    """
    NA detection in the index follows na_filter (gh-5239).

    Don't parse NA-values in index unless na_filter=True
    """
    expected_index = Index(index_data, name="b")
    expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=expected_index)

    buffer = StringIO("a,b,c\n1,,3\n4,5,6")
    result = all_parsers.read_csv(buffer, index_col=[1], na_filter=na_filter)
    tm.assert_frame_equal(result, expected)
def test_inf_na_values_with_int_index(all_parsers):
    """
    "inf" / "-inf" as na_values next to an integer index column (gh-17128).

    Don't fail with OverflowError with inf's and integer index column.
    """
    buffer = StringIO("idx,col1,col2\n1,3,4\n2,inf,-inf")
    expected = DataFrame(
        {"col1": [3, np.nan], "col2": [4, np.nan]}, index=Index([1, 2], name="idx")
    )

    result = all_parsers.read_csv(buffer, index_col=[0], na_values=["inf", "-inf"])
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
    """dtype=str combined with na_filter on an empty field (gh-20377)."""
    # na_filter=True --> missing value becomes NaN.
    # na_filter=False --> missing value remains empty string.
    if na_filter:
        missing = np.nan
    else:
        missing = ""
    expected = DataFrame({"a": ["1", "4"], "b": [missing, "5"], "c": ["3", "6"]})

    buffer = StringIO("a,b,c\n1,,3\n4,5,6")
    result = all_parsers.read_csv(buffer, na_filter=na_filter, dtype=str)
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "data, na_values",
    [
        ("false,1\n,1\ntrue", None),
        ("false,1\nnull,1\ntrue", None),
        ("false,1\nnan,1\ntrue", None),
        ("false,1\nfoo,1\ntrue", "foo"),
        ("false,1\nfoo,1\ntrue", ["foo"]),
        ("false,1\nfoo,1\ntrue", {"a": "foo"}),
    ],
)
def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values):
    """Requesting dtype bool for a column that contains NA raises ValueError."""
    # Either of the two alternative messages is accepted.
    expected_message = (
        "(Bool column has NA values in column [0a])|"
        "(cannot safely convert passed user dtype of "
        "bool for object dtyped data in column 0)"
    )
    read_kwargs = {
        "header": None,
        "names": ["a", "b"],
        "dtype": {"a": "bool"},
        "na_values": na_values,
    }
    with pytest.raises(ValueError, match=expected_message):
        all_parsers.read_csv(StringIO(data), **read_kwargs)
@skip_pyarrow
def test_str_nan_dropped(all_parsers):
    """dropna() removes rows with empty fields when reading with dtype=str (gh-21131)."""
    csv_text = """File: small.csv,,
10010010233,0123,654
foo,,bar
01001000155,4530,898"""
    column_names = ["col1", "col2", "col3"]

    frame = all_parsers.read_csv(
        StringIO(csv_text),
        header=None,
        names=column_names,
        dtype={"col1": str, "col2": str, "col3": str},
    )
    result = frame.dropna()

    expected = DataFrame(
        {
            "col1": ["10010010233", "01001000155"],
            "col2": ["0123", "4530"],
            "col3": ["654", "898"],
        },
        index=[1, 3],
    )
    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_nan_multi_index(all_parsers):
    """na_values keyed by a multi-level column tuple (GH 42446)."""
    buffer = StringIO("A,B,B\nX,Y,Z\n1,2,inf")
    header_rows = list(range(2))

    result = all_parsers.read_csv(
        buffer, header=header_rows, na_values={("B", "Z"): "inf"}
    )
    expected = DataFrame(
        {
            ("A", "X"): [1],
            ("B", "Y"): [2],
            ("B", "Z"): [np.nan],
        }
    )
    tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_bool_and_nan_to_bool(all_parsers):
    """Booleans mixed with NaN cannot be read as dtype bool (GH#42808)."""
    csv_text = """0
NaN
True
False
"""
    buffer = StringIO(csv_text)
    with pytest.raises(ValueError, match="NA values"):
        all_parsers.read_csv(buffer, dtype="bool")
def test_bool_and_nan_to_int(all_parsers):
    """Booleans mixed with NaN cannot be read as dtype int (GH#42808)."""
    csv_text = """0
NaN
True
False
"""
    buffer = StringIO(csv_text)
    with pytest.raises(ValueError, match="convert|NoneType"):
        all_parsers.read_csv(buffer, dtype="int")
def test_bool_and_nan_to_float(all_parsers):
    """Booleans mixed with NaN are read as 1.0 / 0.0 / NaN under dtype float (GH#42808)."""
    csv_text = """0
NaN
True
False
"""
    expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]})

    result = all_parsers.read_csv(StringIO(csv_text), dtype="float")
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,342 @@
"""
Tests the parsers' ability to read and parse non-local files,
which therefore require a network connection to be read.
"""
from io import (
BytesIO,
StringIO,
)
import logging
import numpy as np
import pytest
from pandas.compat import is_ci_environment
import pandas.util._test_decorators as td
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.feather_format import read_feather
from pandas.io.parsers import read_csv
@pytest.mark.network
@pytest.mark.single_cpu
@pytest.mark.parametrize("mode", ["explicit", "infer"])
@pytest.mark.parametrize("engine", ["python", "c"])
def test_compressed_urls(
    httpserver,
    datapath,
    salaries_table,
    mode,
    engine,
    compression_only,
    compression_to_extension,
):
    """Read compressed files over HTTP with each engine, with explicit and inferred compression."""
    if compression_only == "tar":
        pytest.skip("TODO: Add tar salaraies.csv to pandas/io/parsers/data")

    extension = compression_to_extension[compression_only]
    local_path = datapath("io", "parser", "data", "salaries.csv" + extension)
    with open(local_path, "rb") as handle:
        httpserver.serve_content(content=handle.read())

    url = httpserver.url + "/salaries.csv" + extension

    # In "infer" mode the mode string itself is passed as ``compression``.
    compression = compression_only if mode == "explicit" else mode

    url_table = read_csv(url, sep="\t", compression=compression, engine=engine)
    tm.assert_frame_equal(url_table, salaries_table)
@pytest.mark.network
@pytest.mark.single_cpu
def test_url_encoding_csv(httpserver, datapath):
    """
    The encoding passed to read_csv is applied to data fetched from a URL.

    GH 10424
    """
    local_path = datapath("io", "parser", "data", "unicode_series.csv")
    with open(local_path, "rb") as handle:
        payload = handle.read()
    httpserver.serve_content(content=payload)

    frame = read_csv(httpserver.url, encoding="latin-1", header=None)
    assert frame.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"
@pytest.fixture
def tips_df(datapath):
    """Return the tips CSV dataset loaded as a DataFrame."""
    csv_path = datapath("io", "data", "csv", "tips.csv")
    return read_csv(csv_path)
@pytest.mark.single_cpu
@pytest.mark.usefixtures("s3_resource")
@td.skip_if_not_us_locale()
class TestS3:
    """Reading from and writing to S3-style URLs through the s3 fixtures."""

    # (key suffix, explicit ``compression`` argument) for each tips.csv variant
    _TIPS_VARIANTS = (("", None), (".gz", "gzip"), (".bz2", "bz2"))

    @staticmethod
    def _assert_populated_frame(obj):
        """Check that ``obj`` is a non-empty DataFrame."""
        assert isinstance(obj, DataFrame)
        assert not obj.empty

    def test_parse_public_s3_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # more of an integration test due to the not-public contents portion
        # can probably mock this though.
        pytest.importorskip("s3fs")
        for suffix, compression in self._TIPS_VARIANTS:
            url = f"s3://{s3_public_bucket_with_data.name}/tips.csv" + suffix
            frame = read_csv(url, compression=compression, storage_options=s3so)
            self._assert_populated_frame(frame)
            tm.assert_frame_equal(frame, tips_df)

    def test_parse_private_s3_bucket(self, s3_private_bucket_with_data, tips_df, s3so):
        # Read public file from bucket with not-public contents
        pytest.importorskip("s3fs")
        url = f"s3://{s3_private_bucket_with_data.name}/tips.csv"
        frame = read_csv(url, storage_options=s3so)
        self._assert_populated_frame(frame)
        tm.assert_frame_equal(frame, tips_df)

    def test_parse_public_s3n_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # Read from AWS s3 as "s3n" URL
        url = f"s3n://{s3_public_bucket_with_data.name}/tips.csv"
        frame = read_csv(url, nrows=10, storage_options=s3so)
        self._assert_populated_frame(frame)
        tm.assert_frame_equal(tips_df.iloc[:10], frame)

    def test_parse_public_s3a_bucket(self, s3_public_bucket_with_data, tips_df, s3so):
        # Read from AWS s3 as "s3a" URL
        url = f"s3a://{s3_public_bucket_with_data.name}/tips.csv"
        frame = read_csv(url, nrows=10, storage_options=s3so)
        self._assert_populated_frame(frame)
        tm.assert_frame_equal(tips_df.iloc[:10], frame)

    def test_parse_public_s3_bucket_nrows(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Only the first ten rows are requested for every variant.
        for suffix, compression in self._TIPS_VARIANTS:
            url = f"s3://{s3_public_bucket_with_data.name}/tips.csv" + suffix
            frame = read_csv(
                url, nrows=10, compression=compression, storage_options=s3so
            )
            self._assert_populated_frame(frame)
            tm.assert_frame_equal(tips_df.iloc[:10], frame)

    def test_parse_public_s3_bucket_chunked(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Read with a chunksize
        chunksize = 5
        for suffix, compression in self._TIPS_VARIANTS:
            url = f"s3://{s3_public_bucket_with_data.name}/tips.csv" + suffix
            with read_csv(
                url,
                chunksize=chunksize,
                compression=compression,
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                # Read a couple of chunks and make sure we see them
                # properly.
                for start in range(0, 3 * chunksize, chunksize):
                    chunk = df_reader.get_chunk()
                    self._assert_populated_frame(chunk)
                    wanted = tips_df.iloc[start : start + chunksize]
                    tm.assert_frame_equal(wanted, chunk)

    def test_parse_public_s3_bucket_chunked_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Read with a chunksize using the Python parser
        chunksize = 5
        for suffix, compression in self._TIPS_VARIANTS:
            url = f"s3://{s3_public_bucket_with_data.name}/tips.csv" + suffix
            with read_csv(
                url,
                chunksize=chunksize,
                compression=compression,
                engine="python",
                storage_options=s3so,
            ) as df_reader:
                assert df_reader.chunksize == chunksize
                # Read a couple of chunks and make sure we see them properly.
                for start in range(0, 3 * chunksize, chunksize):
                    chunk = df_reader.get_chunk()
                    self._assert_populated_frame(chunk)
                    wanted = tips_df.iloc[start : start + chunksize]
                    tm.assert_frame_equal(wanted, chunk)

    def test_parse_public_s3_bucket_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # Same as the default-engine read, but forcing the Python parser.
        for suffix, compression in self._TIPS_VARIANTS:
            url = f"s3://{s3_public_bucket_with_data.name}/tips.csv" + suffix
            frame = read_csv(
                url, engine="python", compression=compression, storage_options=s3so
            )
            self._assert_populated_frame(frame)
            tm.assert_frame_equal(frame, tips_df)

    def test_infer_s3_compression(self, s3_public_bucket_with_data, tips_df, s3so):
        # Compression is inferred from the key suffix instead of being passed.
        for suffix, _ in self._TIPS_VARIANTS:
            url = f"s3://{s3_public_bucket_with_data.name}/tips.csv" + suffix
            frame = read_csv(
                url, engine="python", compression="infer", storage_options=s3so
            )
            self._assert_populated_frame(frame)
            tm.assert_frame_equal(frame, tips_df)

    def test_parse_public_s3_bucket_nrows_python(
        self, s3_public_bucket_with_data, tips_df, s3so
    ):
        # First ten rows of every variant, read with the Python parser.
        for suffix, compression in self._TIPS_VARIANTS:
            url = f"s3://{s3_public_bucket_with_data.name}/tips.csv" + suffix
            frame = read_csv(
                url,
                engine="python",
                nrows=10,
                compression=compression,
                storage_options=s3so,
            )
            self._assert_populated_frame(frame)
            tm.assert_frame_equal(tips_df.iloc[:10], frame)

    def test_read_s3_fails(self, s3so):
        # Reading from a bucket that does not exist raises OSError.
        expected_message = "The specified bucket does not exist"
        with pytest.raises(OSError, match=expected_message):
            read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)

    def test_read_s3_fails_private(self, s3_private_bucket, s3so):
        # Receive a permission error when trying to read a private bucket.
        # It's irrelevant here that this isn't actually a table.
        # NOTE(review): ``s3so`` is requested but not passed to read_csv --
        # confirm this is intentional.
        expected_message = "The specified bucket does not exist"
        with pytest.raises(OSError, match=expected_message):
            read_csv(f"s3://{s3_private_bucket.name}/file.csv")

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_csv_fails(self, tips_df, s3so):
        # GH 32486
        # Attempting to write to an invalid S3 path should raise
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        accepted_errors = (FileNotFoundError, botocore.exceptions.ClientError)
        target = "s3://an_s3_bucket_data_doesnt_exit/not_real.csv"
        with pytest.raises(
            accepted_errors, match="The specified bucket does not exist"
        ):
            tips_df.to_csv(target, storage_options=s3so)

    @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False)
    def test_write_s3_parquet_fails(self, tips_df, s3so):
        # GH 27679
        # Attempting to write to an invalid S3 path should raise
        pytest.importorskip("pyarrow")
        import botocore

        # GH 34087
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html
        # Catch a ClientError since AWS Service Errors are defined dynamically
        accepted_errors = (FileNotFoundError, botocore.exceptions.ClientError)
        target = "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet"
        with pytest.raises(
            accepted_errors, match="The specified bucket does not exist"
        ):
            tips_df.to_parquet(target, storage_options=s3so)

    @pytest.mark.single_cpu
    def test_read_csv_handles_boto_s3_object(
        self, s3_public_bucket_with_data, tips_file
    ):
        # see gh-16135
        s3_object = s3_public_bucket_with_data.Object("tips.csv")
        raw_bytes = s3_object.get()["Body"].read()

        with BytesIO(raw_bytes) as buffer:
            result = read_csv(buffer, encoding="utf8")

        self._assert_populated_frame(result)
        tm.assert_frame_equal(result, read_csv(tips_file))

    @pytest.mark.single_cpu
    @pytest.mark.skipif(
        is_ci_environment(),
        reason="GH: 45651: This test can hang in our CI min_versions build",
    )
    def test_read_csv_chunked_download(self, s3_public_bucket, caplog, s3so):
        # 8 MB, S3FS uses 5MB chunks
        import s3fs

        values = np.random.default_rng(2).standard_normal((100000, 4))
        frame = DataFrame(values, columns=list("abcd"))

        text_buffer = StringIO()
        frame.to_csv(text_buffer)
        body = BytesIO(text_buffer.getvalue().encode("utf-8"))
        s3_public_bucket.put_object(Key="large-file.csv", Body=body)

        # Possibly some state leaking in between tests.
        # If we don't clear this cache, we saw `GetObject operation: Forbidden`.
        # Presumably the s3fs instance is being cached, with the directory listing
        # from *before* we add the large-file.csv in the s3_public_bucket_with_data.
        s3fs.S3FileSystem.clear_instance_cache()

        with caplog.at_level(logging.DEBUG, logger="s3fs"):
            read_csv(
                f"s3://{s3_public_bucket.name}/large-file.csv",
                nrows=5,
                storage_options=s3so,
            )
            # log of fetch_range (start, stop)
            assert (0, 5505024) in (x.args[-2:] for x in caplog.records)

    def test_read_s3_with_hash_in_key(self, s3_public_bucket_with_data, tips_df, s3so):
        # GH 25945
        url = f"s3://{s3_public_bucket_with_data.name}/tips#1.csv"
        result = read_csv(url, storage_options=s3so)
        tm.assert_frame_equal(tips_df, result)

    def test_read_feather_s3_file_path(
        self, s3_public_bucket_with_data, feather_file, s3so
    ):
        # GH 29055
        pytest.importorskip("pyarrow")
        expected = read_feather(feather_file)
        url = f"s3://{s3_public_bucket_with_data.name}/simple_dataset.feather"
        result = read_feather(url, storage_options=s3so)
        tm.assert_frame_equal(expected, result)

View File

@@ -0,0 +1,562 @@
"""
Tests that apply specifically to the Python parser. Unless specifically
stated as a Python-specific issue, the goal is to eventually move as many of
these tests as possible out of this module once the C parser can accept
further arguments when parsing.
"""
from __future__ import annotations
import csv
from io import (
BytesIO,
StringIO,
TextIOWrapper,
)
from typing import TYPE_CHECKING
import numpy as np
import pytest
from pandas.errors import (
ParserError,
ParserWarning,
)
from pandas import (
DataFrame,
Index,
MultiIndex,
)
import pandas._testing as tm
if TYPE_CHECKING:
from collections.abc import Iterator
def test_default_separator(python_parser_only):
    """
    sep=None falls back on sniffing the separator (gh-17333).

    csv.Sniffer in Python treats "o" as separator.
    """
    buffer = StringIO("aob\n1o2\n3o4")
    result = python_parser_only.read_csv(buffer, sep=None)

    expected = DataFrame({"a": [1, 3], "b": [2, 4]})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
    """A non-integer skipfooter is rejected; see gh-15925 (comment)."""
    buffer = StringIO("a\n1\n2")
    with pytest.raises(ValueError, match="skipfooter must be an integer"):
        python_parser_only.read_csv(buffer, skipfooter=skipfooter)
def test_invalid_skipfooter_negative(python_parser_only):
    """A negative skipfooter is rejected; see gh-15925 (comment)."""
    buffer = StringIO("a\n1\n2")
    with pytest.raises(ValueError, match="skipfooter cannot be negative"):
        python_parser_only.read_csv(buffer, skipfooter=-1)
@pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
def test_sniff_delimiter(python_parser_only, kwargs):
    """A "|" delimiter works both when sniffed and when given explicitly."""
    csv_text = """index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )

    result = python_parser_only.read_csv(StringIO(csv_text), index_col=0, **kwargs)
    tm.assert_frame_equal(result, expected)
def test_sniff_delimiter_comment(python_parser_only):
    """Delimiter sniffing combined with comment lines and trailing comments."""
    csv_text = """# comment line
index|A|B|C
# comment line
foo|1|2|3 # ignore | this
bar|4|5|6
baz|7|8|9
"""
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )

    result = python_parser_only.read_csv(
        StringIO(csv_text), index_col=0, sep=None, comment="#"
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("encoding", [None, "utf-8"])
def test_sniff_delimiter_encoding(python_parser_only, encoding):
    """Delimiter sniffing with skipped leading rows, with and without an encoding."""
    csv_text = """ignore this
ignore this too
index|A|B|C
foo|1|2|3
bar|4|5|6
baz|7|8|9
"""
    if encoding is None:
        buffer = StringIO(csv_text)
    else:
        # Encode to bytes, then decode again through a text wrapper.
        raw = BytesIO(csv_text.encode(encoding))
        buffer = TextIOWrapper(raw, encoding=encoding)

    result = python_parser_only.read_csv(
        buffer, index_col=0, sep=None, skiprows=2, encoding=encoding
    )
    expected = DataFrame(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        columns=["A", "B", "C"],
        index=Index(["foo", "bar", "baz"], name="index"),
    )
    tm.assert_frame_equal(result, expected)
def test_single_line(python_parser_only):
    """The separator is sniffed even from a single line (gh-6607)."""
    expected = DataFrame({"a": [1], "b": [2]})

    result = python_parser_only.read_csv(
        StringIO("1,2"), names=["a", "b"], header=None, sep=None
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
def test_skipfooter(python_parser_only, kwargs):
    """skipfooter=2 and nrows=3 both drop the two footer lines (gh-6607)."""
    csv_text = """A,B,C
1,2,3
4,5,6
7,8,9
want to skip this
also also skip this
"""
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])

    result = python_parser_only.read_csv(StringIO(csv_text), **kwargs)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
)
def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
    """A multi-character separator works on compressed input (gh-6607)."""
    with open(csv1, "rb") as handle:
        raw = handle.read()

    # Swap the single-character separator for a multi-character one.
    payload = raw.replace(b",", b"::")
    expected = python_parser_only.read_csv(csv1)

    compression_module = pytest.importorskip(compression)
    writer_class = getattr(compression_module, klass)

    with tm.ensure_clean() as path:
        with writer_class(path, mode="wb") as compressed:
            compressed.write(payload)

        result = python_parser_only.read_csv(path, sep="::", compression=compression)
        tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index(python_parser_only):
    """Whitespace-separated data with a four-level index (gh-6607)."""
    csv_text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""

    expected_index = MultiIndex.from_tuples(
        [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
        names=["one", "two", "three", "four"],
    )
    expected = DataFrame(
        [
            [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
            [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
            [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
        ],
        columns=["A", "B", "C", "D", "E"],
        index=expected_index,
    )

    result = python_parser_only.read_csv(StringIO(csv_text), sep=r"\s+")
    tm.assert_frame_equal(result, expected)
def test_read_csv_buglet_4x_multi_index2(python_parser_only):
    """Whitespace-separated data with a three-level index (gh-6893)."""
    buffer = StringIO(" A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9")
    records = [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)]
    expected = DataFrame.from_records(
        records,
        columns=list("abcABC"),
        index=list("abc"),
    )

    result = python_parser_only.read_csv(buffer, sep=r"\s+")
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("add_footer", [True, False])
def test_skipfooter_with_decimal(python_parser_only, add_footer):
    """A custom decimal marker still converts values when a footer is skipped (gh-6971)."""
    csv_text = "1#2\n3#4"
    kwargs = {}
    if add_footer:
        # The stray footer line should not mess with the
        # casting of the first two lines if we skip it.
        csv_text += "\nFooter"
        kwargs = {"skipfooter": 1}

    result = python_parser_only.read_csv(
        StringIO(csv_text), names=["a"], decimal="#", **kwargs
    )
    expected = DataFrame({"a": [1.2, 3.4]})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
)
@pytest.mark.parametrize(
    "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
)
def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
    """Multi-character separators work on non-UTF-8 encoded input (gh-3404)."""
    line = "1" + sep + "2"
    buffer = BytesIO(line.encode(encoding))

    result = python_parser_only.read_csv(
        buffer, sep=sep, names=["a", "b"], encoding=encoding
    )
    expected = DataFrame({"a": [1], "b": [2]})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
def test_multi_char_sep_quotes(python_parser_only, quoting):
    """A quoted field containing a multi-char separator raises ParserError (gh-13374)."""
    csv_text = 'a,,b\n1,,a\n2,,"2,,b"'

    # The error message depends on the quoting mode.
    if quoting == csv.QUOTE_NONE:
        expected_message = "Expected 2 fields in line 3, saw 3"
    else:
        expected_message = "ignored when a multi-char delimiter is used"

    with pytest.raises(ParserError, match=expected_message):
        python_parser_only.read_csv(StringIO(csv_text), quoting=quoting, sep=",,")
def test_none_delimiter(python_parser_only, capsys):
    """``sep=None`` with ``on_bad_lines="warn"`` (gh-13374, gh-17465)."""
    text = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"

    # The third line is malformed: it must be skipped (and reported on
    # stderr) rather than raising.
    result = python_parser_only.read_csv(
        StringIO(text), header=0, sep=None, on_bad_lines="warn"
    )
    tm.assert_frame_equal(result, DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]}))

    stderr_text = capsys.readouterr().err
    assert "Skipping line 3" in stderr_text
@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
    """Malformed trailing rows raise with or without skipfooter (gh-13879, gh-15910)."""
    if skipfooter:
        msg = "parsing errors in the skipped footer rows"
    else:
        msg = "unexpected end of data|expected after"

    with pytest.raises(ParserError, match=msg):
        python_parser_only.read_csv(StringIO(data), skipfooter=skipfooter)
def test_malformed_skipfooter(python_parser_only):
    """An over-long row is still reported when a footer line is skipped."""
    text = (
        "ignore\n"
        "A,B,C\n"
        "1,2,3 # comment\n"
        "1,2,3,4,5\n"
        "2,3,4\n"
        "footer\n"
    )
    with pytest.raises(ParserError, match="Expected 3 fields in line 4, saw 5"):
        python_parser_only.read_csv(
            StringIO(text), header=1, comment="#", skipfooter=1
        )
def test_python_engine_file_no_next(python_parser_only):
    """Read from an object that is iterable but defines no ``__next__``."""

    class NoNextBuffer:
        """File stand-in exposing only ``__iter__``, ``read`` and ``readline``."""

        def __init__(self, csv_data) -> None:
            self.data = csv_data

        def __iter__(self) -> Iterator:
            return iter(self.data)

        def read(self):
            return self.data

        def readline(self):
            return self.data

    # Only checks that parsing completes without raising.
    python_parser_only.read_csv(NoNextBuffer("a\n1"))
@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
    """A callable ``on_bad_lines`` supplies the replacement row (GH 5686)."""
    source = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4\n")

    result = python_parser_only.read_csv(source, on_bad_lines=bad_line_func)
    tm.assert_frame_equal(result, DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}))
def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
    """The callable receives the full bad row and may record it (GH 5686)."""
    captured = []

    def bad_line_func(bad_line: list[str]) -> list[str]:
        # Remember what the parser handed over, then return a valid row.
        captured.append(bad_line)
        return ["2", "3"]

    source = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4\n")
    result = python_parser_only.read_csv(source, on_bad_lines=bad_line_func)

    tm.assert_frame_equal(result, DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}))
    assert captured == [["2", "3", "4", "5", "6"]]
@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
@pytest.mark.parametrize("sep", [",", "111"])
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
    """Callable ``on_bad_lines`` when reading in chunks (GH 5686)."""
    # iterator=True has a separate code path than iterator=False
    text = f"\n0{sep}1\nhi{sep}there\nfoo{sep}bar{sep}baz\ngood{sep}bye\n"

    chunks = python_parser_only.read_csv(
        StringIO(text),
        on_bad_lines=bad_line_func,
        chunksize=1,
        iterator=True,
        sep=sep,
    )
    rows = [
        {"0": "hi", "1": "there"},
        {"0": "foo", "1": "bar"},
        {"0": "good", "1": "bye"},
    ]
    for pos, (chunk, row) in enumerate(zip(chunks, rows)):
        # Each chunk holds one row, labelled with its position in the file.
        tm.assert_frame_equal(chunk, DataFrame(row, index=range(pos, pos + 1)))
def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
    """An exception raised inside the callable propagates (GH 5686)."""
    msg = "This function is buggy."

    def bad_line_func(bad_line):
        raise ValueError(msg)

    source = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4\n")
    with pytest.raises(ValueError, match=msg):
        python_parser_only.read_csv(source, on_bad_lines=bad_line_func)
def test_on_bad_lines_callable_not_expected_length(python_parser_only):
    """A callable returning an over-long row triggers a ParserWarning (GH 5686)."""
    source = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4\n")

    result = python_parser_only.read_csv_check_warnings(
        ParserWarning,
        "Length of header or names",
        source,
        on_bad_lines=lambda x: x,
    )
    tm.assert_frame_equal(result, DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}))
def test_on_bad_lines_callable_returns_none(python_parser_only):
    """Returning ``None`` from the callable drops the bad row (GH 5686)."""
    source = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4\n")

    result = python_parser_only.read_csv(source, on_bad_lines=lambda x: None)
    tm.assert_frame_equal(result, DataFrame({"a": [1, 3], "b": [2, 4]}))
def test_on_bad_lines_index_col_inferred(python_parser_only):
    """Rows one field wider than the header yield an inferred index (GH 5686)."""
    source = StringIO("a,b\n1,2,3\n4,5,6\n")

    result = python_parser_only.read_csv(source, on_bad_lines=lambda x: ["99", "99"])

    # The expected frame contains no 99s: the first field of each row is
    # the index and the rest are the data.
    expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
    tm.assert_frame_equal(result, expected)
def test_index_col_false_and_header_none(python_parser_only):
    """``index_col=False`` with ``header=None`` truncates long rows (GH#46955)."""
    text = "\n0.5,0.03\n0.1,0.2,0.3,2\n"

    result = python_parser_only.read_csv_check_warnings(
        ParserWarning,
        "Length of header",
        StringIO(text),
        sep=",",
        header=None,
        index_col=False,
    )
    tm.assert_frame_equal(result, DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]}))
def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only):
    """Only the one-column header is used; wider rows are truncated (GH#46569)."""
    source = StringIO("a\na,b\nc,d,e\nf,g,h")

    result = python_parser_only.read_csv_check_warnings(
        ParserWarning,
        "Length of header",
        source,
        engine="python",
        index_col=False,
    )
    tm.assert_frame_equal(result, DataFrame({"a": ["a", "c", "f"]}))
@pytest.mark.parametrize(
    "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}]
)
def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, dtype):
    """``thousands="."`` must leave the non-numeric column untouched (GH#50270)."""
    text = (
        "a;b;c\n"
        "0000.7995;16.000;0\n"
        "3.03.001.00514;0;4.000\n"
        "4923.600.041;23.000;131"
    )

    result = python_parser_only.read_csv(
        StringIO(text), sep=";", dtype=dtype, thousands="."
    )

    # Column "a" keeps its dots; "b" and "c" have them stripped as
    # thousands markers.
    expected = DataFrame(
        {
            "a": ["0000.7995", "3.03.001.00514", "4923.600.041"],
            "b": [16000, 0, 23000],
            "c": [0, 4000, 131],
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dtype,expected",
    [
        (
            {"a": str, "b": np.float64, "c": np.int64},
            DataFrame(
                {
                    "b": [16000.1, 0, 23000],
                    "c": [0, 4001, 131],
                }
            ),
        ),
        (
            str,
            DataFrame(
                {
                    "b": ["16,000.1", "0", "23,000"],
                    "c": ["0", "4,001", "131"],
                }
            ),
        ),
    ],
)
def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, expected):
    """``thousands=","`` must not rewrite columns read as strings (GH#50270).

    ``expected`` arrives without column "a"; that column is identical for
    both parametrizations and is prepended here.
    """
    parser = python_parser_only
    data = """a;b;c
0000,7995;16,000.1;0
3,03,001,00514;0;4,001
4923,600,041;23,000;131
"""
    result = parser.read_csv(
        StringIO(data),
        sep=";",
        dtype=dtype,
        thousands=",",
    )
    # pytest hands the same DataFrame object to every invocation of a given
    # parametrization.  Inserting into it in place would make a second
    # invocation (another fixture parameter, a rerun) fail because column
    # "a" already exists, so work on a copy.
    expected = expected.copy()
    expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,167 @@
"""
Tests that quoting specifications are properly handled
during parsing for all of the parsers defined in parsers.py
"""
import csv
from io import StringIO
import pytest
from pandas.compat import PY311
from pandas.errors import ParserError
from pandas import DataFrame
import pandas._testing as tm
# Request the "pyarrow_skip" fixture for every test in this module.
# NOTE(review): presumably this skips the pyarrow engine; the fixture is
# defined elsewhere (confirm in conftest).
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize(
    "kwargs,msg",
    [
        ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'),
        (
            {"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
            "quotechar must be set if quoting enabled",
        ),
        ({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'),
    ],
)
def test_bad_quote_char(all_parsers, kwargs, msg):
    """Invalid ``quotechar`` values are rejected with ``TypeError``."""
    source = StringIO("1,2,3")
    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(source, **kwargs)
@pytest.mark.parametrize(
    "quoting,msg",
    [
        ("foo", '"quoting" must be an integer|Argument'),
        (10, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
    ],
)
def test_bad_quoting(all_parsers, quoting, msg):
    """Invalid ``quoting`` values are rejected with ``TypeError``."""
    source = StringIO("1,2,3")
    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(source, quoting=quoting)
def test_quote_char_basic(all_parsers):
    """A double-quoted field is returned without its quotes."""
    text = 'a,b,c\n1,2,"cat"'

    result = all_parsers.read_csv(StringIO(text), quotechar='"')
    tm.assert_frame_equal(result, DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]))
@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    """Alternative single-character quote characters behave like ``"``."""
    # Swap the double quotes for the quote character under test.
    text = 'a,b,c\n1,2,"cat"'.replace('"', quote_char)

    result = all_parsers.read_csv(StringIO(text), quotechar=quote_char)
    tm.assert_frame_equal(result, DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"]))
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    """Empty or ``None`` quotechar: rejected unless quoting is disabled."""
    options = {"quotechar": quote_char, "quoting": quoting}
    text = "a,b,c\n1,2,3"
    py311_python_engine = PY311 and all_parsers.engine == "python"

    if quoting != csv.QUOTE_NONE:
        # Sanity checking.
        if py311_python_engine and quote_char == "":
            msg = '"quotechar" must be a 1-character string'
        else:
            msg = "quotechar must be set if quoting enabled"
        with pytest.raises(TypeError, match=msg):
            all_parsers.read_csv(StringIO(text), **options)
        return

    if py311_python_engine:
        # Python 3.11+ doesn't support null/blank quote chars in their csv parsers
        return

    result = all_parsers.read_csv(StringIO(text), **options)
    tm.assert_frame_equal(result, DataFrame([[1, 2, 3]], columns=["a", "b", "c"]))
@pytest.mark.parametrize(
    "kwargs,exp_data",
    [
        # Default settings.
        ({}, [[1, 2, "foo"]]),
        # QUOTE_MINIMAL: the expected rows are the same as the default.
        ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),
        # QUOTE_ALL: likewise the same as the default.
        ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),
        # QUOTE_NONE: quote characters are left in the field value.
        ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),
        # QUOTE_NONNUMERIC: unquoted fields are expected back as floats.
        ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]),
    ],
)
def test_quoting_various(all_parsers, kwargs, exp_data):
    """Each ``csv.QUOTE_*`` mode yields the expected values on read."""
    names = ["a", "b", "c"]

    result = all_parsers.read_csv(StringIO('1,2,"foo"'), names=names, **kwargs)
    tm.assert_frame_equal(result, DataFrame(exp_data, columns=names))
@pytest.mark.parametrize(
    "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data):
    """A doubled quote inside a quoted field, with ``doublequote`` on and off."""
    text = 'a,b\n3,"4 "" 5"'

    result = all_parsers.read_csv(
        StringIO(text), quotechar='"', doublequote=doublequote
    )
    tm.assert_frame_equal(result, DataFrame(exp_data, columns=["a", "b"]))
@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
    """A control character is accepted as ``quotechar`` (gh-14477)."""
    result = all_parsers.read_csv(StringIO("a\n1"), quotechar=quotechar)
    tm.assert_frame_equal(result, DataFrame({"a": [1]}))
@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced):
    """An unterminated quoted field raises; closing it parses (gh-22789)."""
    text = 'a,b,c\n1,2,"3'

    if not balanced:
        # The two engines report the dangling quote with different messages.
        if all_parsers.engine == "c":
            msg = "EOF inside string starting at row 1"
        else:
            msg = "unexpected end of data"
        with pytest.raises(ParserError, match=msg):
            all_parsers.read_csv(StringIO(text))
        return

    # Re-balance the quoting and read in without errors.
    result = all_parsers.read_csv(StringIO(text + '"'))
    tm.assert_frame_equal(result, DataFrame([[1, 2, 3]], columns=["a", "b", "c"]))

View File

@@ -0,0 +1,288 @@
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py
"""
from datetime import datetime
from io import StringIO
import numpy as np
import pytest
from pandas.errors import EmptyDataError
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
# TODO: switch this to an xfail once the hanging-test issues are identified.
# Requests the "pyarrow_skip" fixture for every test in this module.
# NOTE(review): presumably this skips the pyarrow engine; the fixture is
# defined elsewhere (confirm in conftest).
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("skiprows", [list(range(6)), 6])
def test_skip_rows_bug(all_parsers, skiprows):
    """Skipping six leading lines, given as a list or a count (gh-505)."""
    # Six comment-like lines followed by three data rows.
    text = "#foo,a,b,c\n" * 6 + "1/1/2000,1.,2.,3.\n1/2/2000,4,5,6\n1/3/2000,7,8,9\n"

    result = all_parsers.read_csv(
        StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
    )

    dates = [datetime(2000, 1, day) for day in (1, 2, 3)]
    expected = DataFrame(
        np.arange(1.0, 10.0).reshape((3, 3)),
        columns=[1, 2, 3],
        index=Index(dates, name=0),
    )
    tm.assert_frame_equal(result, expected)
def test_deep_skip_rows(all_parsers):
    """Skipping file lines 6 and 8 equals parsing a file without them (gh-4382)."""

    def build_csv(starts):
        # One "i,i+1,i+2" row per start value, under an "a,b,c" header.
        body = "\n".join(f"{i},{i + 1},{i + 2}" for i in starts)
        return "a,b,c\n" + body

    full_text = build_csv(range(10))
    # File lines 6 and 8 (header is line 0) hold the rows starting at 5 and 7.
    condensed_text = build_csv([0, 1, 2, 3, 4, 6, 8, 9])

    result = all_parsers.read_csv(StringIO(full_text), skiprows=[6, 8])
    condensed_result = all_parsers.read_csv(StringIO(condensed_text))
    tm.assert_frame_equal(result, condensed_result)
def test_skip_rows_blank(all_parsers):
# see gh-9832
parser = all_parsers
text = """#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
#foo,a,b,c
1/1/2000,1.,2.,3.
1/2/2000,4,5,6
1/3/2000,7,8,9
"""
data = parser.read_csv(
StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
)
index = Index(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
)
expected = DataFrame(
np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
)
tm.assert_frame_equal(data, expected)
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        (
            """id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
            {"skiprows": [1]},
            DataFrame(
                [[2, "line 21\nline 22", 2], [3, "line 31", 1]],
                columns=["id", "text", "num_lines"],
            ),
        ),
        (
            "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
            {"quotechar": "~", "skiprows": [2]},
            DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
        ),
        (
            (
                "Text,url\n~example\n "
                "sentence\n one~,url1\n~"
                "example\n sentence\n two~,url2\n~"
                "example\n sentence\n three~,url3"
            ),
            {"quotechar": "~", "skiprows": [1, 3]},
            DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
        ),
    ],
)
def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
    """Skipped rows may contain quoted embedded newlines (gh-12775, gh-10911)."""
    frame = all_parsers.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(frame, expected)
def test_skip_row_with_quote(all_parsers):
    """Skip a row whose quoted field contains single quotes (gh-12775, gh-10911)."""
    text = (
        "id,text,num_lines\n"
        "1,\"line '11' line 12\",2\n"
        "2,\"line '21' line 22\",2\n"
        "3,\"line '31' line 32\",1"
    )

    result = all_parsers.read_csv(StringIO(text), skiprows=[1])

    expected = DataFrame(
        [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]],
        columns=["id", "text", "num_lines"],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "data,exp_data",
    [
        (
            """id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
            [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
            [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
        ),
        (
            """id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
            [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
        ),
    ],
)
def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
    """Skip a row mixing embedded newlines and single quotes (gh-12775, gh-10911)."""
    frame = all_parsers.read_csv(StringIO(data), skiprows=[1])
    tm.assert_frame_equal(
        frame, DataFrame(exp_data, columns=["id", "text", "num_lines"])
    )
@pytest.mark.parametrize(
    "lineterminator", ["\n", "\r\n", "\r"]  # "LF"  # "CRLF"  # "CR"
)
def test_skiprows_lineterminator(all_parsers, lineterminator, request):
    """``skiprows`` works with each line-ending convention (gh-9079)."""
    lines = [
        "SMOSMANIA ThetaProbe-ML2X ",
        "2007/01/01 01:00 0.2140 U M ",
        "2007/01/01 02:00 0.2141 M O ",
        "2007/01/01 04:00 0.2142 D M ",
    ]
    text = "\n".join(lines)
    expected = DataFrame(
        [
            ["2007/01/01", "01:00", 0.2140, "U", "M"],
            ["2007/01/01", "02:00", 0.2141, "M", "O"],
            ["2007/01/01", "04:00", 0.2142, "D", "M"],
        ],
        columns=["date", "time", "var", "flag", "oflag"],
    )

    if all_parsers.engine == "python" and lineterminator == "\r":
        xfail = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet")
        request.node.add_marker(xfail)

    # Rewrite the line endings to the terminator under test.
    text = text.replace("\n", lineterminator)
    result = all_parsers.read_csv(
        StringIO(text),
        skiprows=1,
        delim_whitespace=True,
        names=["date", "time", "var", "flag", "oflag"],
    )
    tm.assert_frame_equal(result, expected)
def test_skiprows_infield_quote(all_parsers):
    """Quote characters inside skipped lines do not affect later rows (gh-14459)."""
    text = 'a"\nb"\na\n1'

    result = all_parsers.read_csv(StringIO(text), skiprows=2)
    tm.assert_frame_equal(result, DataFrame({"a": [1]}))
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        ({}, DataFrame({"1": [3, 5]})),
        ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
    ],
)
def test_skip_rows_callable(all_parsers, kwargs, expected):
    """A callable ``skiprows`` that drops every even-numbered line."""
    source = StringIO("a\n1\n2\n3\n4\n5")

    result = all_parsers.read_csv(source, skiprows=lambda x: x % 2 == 0, **kwargs)
    tm.assert_frame_equal(result, expected)
def test_skip_rows_callable_not_in(all_parsers):
    """A callable ``skiprows`` that keeps only lines 1 and 3."""
    source = StringIO("0,a\n1,b\n2,c\n3,d\n4,e")

    result = all_parsers.read_csv(
        source, header=None, skiprows=lambda x: x not in [1, 3]
    )
    tm.assert_frame_equal(result, DataFrame([[1, "b"], [3, "d"]]))
def test_skip_rows_skip_all(all_parsers):
    """Skipping every line raises ``EmptyDataError``."""
    source = StringIO("a\n1\n2\n3\n4\n5")

    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        all_parsers.read_csv(source, skiprows=lambda x: True)
def test_skip_rows_bad_callable(all_parsers):
    """An exception raised by the ``skiprows`` callable propagates."""
    source = StringIO("a\n1\n2\n3\n4\n5")

    with pytest.raises(ZeroDivisionError, match="by zero"):
        all_parsers.read_csv(source, skiprows=lambda x: 1 / 0)
def test_skip_rows_and_n_rows(all_parsers):
    """``nrows`` counts rows that survive ``skiprows`` (GH#44021)."""
    text = "a,b\n1,a\n2,b\n3,c\n4,d\n5,e\n6,f\n7,g\n8,h\n"

    result = all_parsers.read_csv(StringIO(text), nrows=5, skiprows=[2, 4, 6])

    # File lines 2, 4 and 6 are dropped; five of the remaining rows are read.
    expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,343 @@
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
from io import (
BytesIO,
StringIO,
)
import numpy as np
import pytest
import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
from pandas import DataFrame
import pandas._testing as tm
from pandas.io.parsers import (
TextFileReader,
read_csv,
)
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
class TestTextReader:
    """Tests that drive ``TextReader`` directly, plus a few ``read_csv`` checks."""

    @pytest.fixture
    def csv_path(self, datapath):
        """Path of the ``test1.csv`` data file."""
        return datapath("io", "data", "csv", "test1.csv")

    def test_file_handle(self, csv_path):
        """Read from an open binary file handle."""
        with open(csv_path, "rb") as handle:
            TextReader(handle).read()

    def test_file_handle_mmap(self, csv_path):
        """Read from an open binary file handle with ``header=None``."""
        # this was never using memory_map=True
        with open(csv_path, "rb") as handle:
            TextReader(handle, header=None).read()

    def test_StringIO(self, csv_path):
        """Read the file's bytes back through an in-memory ``BytesIO``."""
        with open(csv_path, "rb") as handle:
            raw = handle.read()
        TextReader(BytesIO(raw), header=None).read()

    def test_string_factorize(self):
        """Repeated strings share objects: only two distinct ids remain."""
        # should this be optional?
        reader = TextReader(StringIO("a\nb\na\nb\na"), header=None)
        column = reader.read()[0]
        assert len({id(value) for value in column}) == 2

    def test_skipinitialspace(self):
        """Spaces after the delimiter are dropped with ``skipinitialspace``."""
        text = "a, b\na, b\na, b\na, b"
        reader = TextReader(StringIO(text), skipinitialspace=True, header=None)
        result = reader.read()

        for col, letter in enumerate("ab"):
            tm.assert_numpy_array_equal(
                result[col], np.array([letter] * 4, dtype=np.object_)
            )

    def test_parse_booleans(self):
        """``True``/``False`` tokens are read as a boolean column."""
        reader = TextReader(StringIO("True\nFalse\nTrue\nTrue"), header=None)
        assert reader.read()[0].dtype == np.bool_

    def test_delimit_whitespace(self):
        """Mixed spaces/tabs split fields; quotes around fields are removed."""
        text = 'a b\na\t\t "b"\n"a"\t \t b'
        reader = TextReader(StringIO(text), delim_whitespace=True, header=None)
        result = reader.read()

        for col, letter in enumerate("ab"):
            tm.assert_numpy_array_equal(
                result[col], np.array([letter] * 3, dtype=np.object_)
            )

    def test_embedded_newline(self):
        """A newline inside a quoted field stays part of that field."""
        text = 'a\n"hello\nthere"\nthis'
        result = TextReader(StringIO(text), header=None).read()

        tm.assert_numpy_array_equal(
            result[0], np.array(["a", "hello\nthere", "this"], dtype=np.object_)
        )

    def test_euro_decimal(self):
        """A comma decimal mark is honoured when it is not the delimiter."""
        reader = TextReader(
            StringIO("12345,67\n345,678"), delimiter=":", decimal=",", header=None
        )
        tm.assert_almost_equal(reader.read()[0], np.array([12345.67, 345.678]))

    def test_integer_thousands(self):
        """Comma thousands markers are stripped from integers."""
        reader = TextReader(
            StringIO("123,456\n12,500"), delimiter=":", thousands=",", header=None
        )
        tm.assert_almost_equal(
            reader.read()[0], np.array([123456, 12500], dtype=np.int64)
        )

    def test_integer_thousands_alt(self):
        """Dot thousands markers through ``TextFileReader``."""
        reader = TextFileReader(
            StringIO("123.456\n12.500"), delimiter=":", thousands=".", header=None
        )
        tm.assert_frame_equal(reader.read(), DataFrame([123456, 12500]))

    def test_skip_bad_lines(self, capsys):
        """Over-long lines: raise by default, skip, or skip with a message."""
        # too many lines, see #2430 for why
        text = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"

        # Default behaviour: the first bad line raises.
        strict = TextReader(StringIO(text), delimiter=":", header=None)
        msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
        with pytest.raises(parser.ParserError, match=msg):
            strict.read()

        skipping = TextReader(
            StringIO(text), delimiter=":", header=None, on_bad_lines=2  # Skip
        )
        expected = {
            0: np.array(["a", "d", "g", "l"], dtype=object),
            1: np.array(["b", "e", "h", "m"], dtype=object),
            2: np.array(["c", "f", "i", "n"], dtype=object),
        }
        assert_array_dicts_equal(skipping.read(), expected)

        warning = TextReader(
            StringIO(text), delimiter=":", header=None, on_bad_lines=1  # Warn
        )
        warning.read()
        stderr_text = capsys.readouterr().err
        assert "Skipping line 4" in stderr_text
        assert "Skipping line 6" in stderr_text

    def test_header_not_enough_lines(self):
        """``header=2`` takes the third line as the header."""
        text = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"
        reader = TextReader(StringIO(text), delimiter=",", header=2)

        assert reader.header == [["a", "b", "c"]]

        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array([2, 5], dtype=np.int64),
            2: np.array([3, 6], dtype=np.int64),
        }
        assert_array_dicts_equal(reader.read(), expected)

    def test_escapechar(self):
        """An escaped leading quote is kept as a literal character."""
        text = "\n".join(['\\"hello world"'] * 3)
        reader = TextReader(StringIO(text), delimiter=",", header=None, escapechar="\\")

        expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(reader.read(), expected)

    def test_eof_has_eol(self):
        """Placeholder: handling of a new line at EOF (no assertions yet)."""
        # handling of new line at EOF
        pass

    def test_na_substitution(self):
        """Placeholder with no assertions yet."""
        pass

    def test_numpy_string_dtype(self):
        """Fixed-width byte-string dtypes, including truncation to ``S4``."""
        data = """\
a,1
aa,2
aaa,3
aaaa,4
aaaaa,5"""

        def _make_reader(**kwds):
            # Normalise a user-style dtype spec before handing it over.
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", header=None, **kwds)

        result = _make_reader(dtype="S5,i4").read()
        assert result[0].dtype == "S5"
        full = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
        assert (result[0] == full).all()
        assert result[1].dtype == "i4"

        result = _make_reader(dtype="S4").read()
        assert result[0].dtype == "S4"
        # The five-character value is cut down to four characters.
        truncated = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
        assert (result[0] == truncated).all()
        assert result[1].dtype == "S4"

    def test_pass_dtype(self):
        """Per-column dtypes keyed by name or by position."""
        data = """\
one,two
1,a
2,b
3,c
4,d"""

        def _make_reader(**kwds):
            if "dtype" in kwds:
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
            return TextReader(StringIO(data), delimiter=",", **kwds)

        result = _make_reader(dtype={"one": "u1", 1: "S1"}).read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "S1"

        result = _make_reader(dtype={"one": np.uint8, 1: object}).read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

        result = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")}).read()
        assert result[0].dtype == "u1"
        assert result[1].dtype == "O"

    def test_usecols(self):
        """``usecols`` keeps only the requested column positions."""
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        def _make_reader(**kwds):
            return TextReader(StringIO(data), delimiter=",", **kwds)

        subset = _make_reader(usecols=(1, 2)).read()
        everything = _make_reader().read()

        assert len(subset) == 2
        for col in (1, 2):
            assert (subset[col] == everything[col]).all()

    @pytest.mark.parametrize(
        "text, kwargs",
        [
            ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
            (
                "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12",
                {"delim_whitespace": True},
            ),
            ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
            (
                (
                    "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
                    "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
                    ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
                ),
                {"delimiter": ","},
            ),
            ("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}),
            ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
        ],
    )
    def test_cr_delimited(self, text, kwargs):
        """CR-only line endings parse the same as CRLF line endings."""
        crlf_text = text.replace("\r", "\r\n")

        from_cr = TextReader(StringIO(text), **kwargs).read()
        from_crlf = TextReader(StringIO(crlf_text), **kwargs).read()
        assert_array_dicts_equal(from_cr, from_crlf)

    def test_empty_field_eof(self):
        """Empty trailing fields at EOF, then short and over-long rows (GH5664)."""
        result = TextReader(StringIO("a,b,c\n1,2,3\n4,,"), delimiter=",").read()
        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array(["2", ""], dtype=object),
            2: np.array(["3", ""], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

        # GH5664
        a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
        c = DataFrame(
            [
                [1, 2, 3, 4],
                [6, np.nan, np.nan, np.nan],
                [8, 9, 10, 11],
                [13, 14, np.nan, np.nan],
            ],
            columns=list("abcd"),
            index=[0, 5, 7, 12],
        )

        # The same three reads are repeated 100 times.
        for _ in range(100):
            frame = read_csv(
                StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c"
            )
            tm.assert_frame_equal(frame, a)

            frame = read_csv(
                StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
            )
            tm.assert_frame_equal(frame, b)

            frame = read_csv(
                StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
                names=list("abcd"),
                engine="c",
            )
            tm.assert_frame_equal(frame, c)

    def test_empty_csv_input(self):
        """Chunked reading of empty input still returns a reader (GH14867)."""
        with read_csv(
            StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
        ) as chunked:
            assert isinstance(chunked, TextFileReader)
def assert_array_dicts_equal(left, right):
    """Assert that two mappings hold equal arrays under the same keys.

    Parameters
    ----------
    left, right : mapping
        Mappings from key to array-like; values are converted with
        ``np.asarray`` before comparison.

    Raises
    ------
    AssertionError
        If the key sets differ or any pair of arrays is not equal.
    """
    # Compare the key sets first: iterating over ``left`` alone would
    # silently accept extra entries in ``right``.
    assert set(left) == set(right), (
        f"keys differ: {sorted(left, key=repr)} != {sorted(right, key=repr)}"
    )
    for k, v in left.items():
        tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))

View File

@@ -0,0 +1,212 @@
"""
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.
Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
"""
from io import StringIO
import os
from pathlib import Path
import pytest
from pandas.compat import (
is_ci_environment,
is_platform_mac,
is_platform_windows,
)
from pandas.errors import ParserError
import pandas._testing as tm
from pandas.io.parsers import read_csv
import pandas.io.parsers.readers as parsers
@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
def python_engine(request):
    """Engine name for the current parametrization: "python" or "python-fwf"."""
    engine_name = request.param
    return engine_name
class TestUnsupportedFeatures:
    """Options an engine does not support must raise or warn clearly."""

    def test_mangle_dupe_cols_false(self):
        """``mangle_dupe_cols`` is rejected as an unexpected keyword (gh-12935)."""
        text = "a b c\n1 2 3"

        for engine in ("c", "python"):
            with pytest.raises(TypeError, match="unexpected keyword"):
                read_csv(StringIO(text), engine=engine, mangle_dupe_cols=True)

    def test_c_engine(self):
        """C-engine limitations: raise when forced, warn otherwise (gh-6607)."""
        data = "a b c\n1 2 3"
        msg = "does not support"
        c_unsupported = [
            {"sep": None, "delim_whitespace": False},
            {"sep": r"\s"},
            {"sep": "\t", "quotechar": chr(128)},
            {"skipfooter": 1},
        ]

        # specify C engine with unsupported options (raise)
        for options in c_unsupported:
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine="c", **options)

        # specify C-unsupported options without python-unsupported options
        for options in c_unsupported:
            with tm.assert_produces_warning(parsers.ParserWarning):
                read_csv(StringIO(data), **options)

        text = """ A B C D E
one two three four
a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        msg = "Error tokenizing data"

        # Once with the default engine, once with the C engine named explicitly.
        for engine_options in ({}, {"engine": "c"}):
            with pytest.raises(ParserError, match=msg):
                read_csv(StringIO(text), sep="\\s+", **engine_options)

        msg = "Only length-1 thousands markers supported"
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        for marker in (",,", ""):
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), thousands=marker)

        msg = "Only length-1 line terminators supported"
        data = "a,b,c~~1,2,3~~4,5,6"
        with pytest.raises(ValueError, match=msg):
            read_csv(StringIO(data), lineterminator="~~")

    def test_python_engine(self, python_engine):
        """Every option in ``_python_unsupported`` raises for Python engines."""
        from pandas.io.parsers.readers import _python_unsupported as py_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

        for option in py_unsupported:
            msg = (
                f"The {option!r} option is not "
                f"supported with the {python_engine!r} engine"
            )
            # Any non-default value will do, so pass a fresh sentinel.
            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine=python_engine, **{option: object()})

    def test_python_engine_file_no_iter(self, python_engine):
        """A buffer without ``__iter__`` is rejected with ``TypeError`` (gh-16530)."""

        class NoNextBuffer:
            """File stand-in with ``__next__``, ``read`` and ``readline`` only."""

            def __init__(self, csv_data) -> None:
                self.data = csv_data

            def __next__(self):
                return self.data.__next__()

            def read(self):
                return self.data

            def readline(self):
                return self.data

        msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"
        with pytest.raises(TypeError, match=msg):
            read_csv(NoNextBuffer("a\n1"), engine=python_engine)

    def test_pyarrow_engine(self):
        """Every option in ``_pyarrow_unsupported`` raises for pyarrow."""
        from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported

        data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
        default_needs_bool = {"warn_bad_lines", "error_bad_lines"}

        for option in pa_unsupported:
            msg = (
                f"The {option!r} option is not "
                "supported with the 'pyarrow' engine"
            )
            # Some options get a specific value instead of the generic sentinel.
            if option == "dialect":
                value = "excel"  # test a random dialect
            elif option in default_needs_bool:
                value = True
            elif option == "on_bad_lines":
                value = "warn"
            else:
                value = object()

            with pytest.raises(ValueError, match=msg):
                read_csv(StringIO(data), engine="pyarrow", **{option: value})

    def test_on_bad_lines_callable_python_only(self, all_parsers):
        """A callable ``on_bad_lines`` is accepted by the python engine only (GH 5686)."""
        source = StringIO("a,b\n1,2")
        bad_lines_func = lambda x: x

        if all_parsers.engine == "python":
            all_parsers.read_csv(source, on_bad_lines=bad_lines_func)
            return

        msg = "on_bad_line can only be a callable function if engine='python'"
        with pytest.raises(ValueError, match=msg):
            all_parsers.read_csv(source, on_bad_lines=bad_lines_func)
def test_close_file_handle_on_invalid_usecols(all_parsers):
    """Reading with a missing ``usecols`` entry raises and warns about nothing.

    GH 45384. The temporary file is unlinked afterwards, which fails on
    windows if file handles still point to it.
    """
    expected_error = ValueError
    if all_parsers.engine == "pyarrow":
        pyarrow = pytest.importorskip("pyarrow")
        expected_error = pyarrow.lib.ArrowKeyError
        on_win_or_mac = is_platform_windows() or is_platform_mac()
        if is_ci_environment() and on_win_or_mac:
            # GH#45547 causes timeouts on windows/mac builds
            pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22")

    with tm.ensure_clean("test.csv") as fname:
        Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
        with tm.assert_produces_warning(False), pytest.raises(
            expected_error, match="col3"
        ):
            all_parsers.read_csv(fname, usecols=["col1", "col2", "col3"])
        # unlink fails on windows if file handles still point to it
        os.unlink(fname)
def test_invalid_file_inputs(request, all_parsers):
    """Passing a list as the input raises ``ValueError`` matching "Invalid".

    GH#45957. The python engine case is marked xfail.
    """
    engine = all_parsers.engine
    if engine == "python":
        xfail_marker = pytest.mark.xfail(reason=f"{engine} engine supports lists.")
        request.node.add_marker(xfail_marker)

    with pytest.raises(ValueError, match="Invalid"):
        all_parsers.read_csv([])
def test_invalid_dtype_backend(all_parsers):
    """``dtype_backend="numpy"`` is rejected with a ``ValueError``."""
    expected_msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv("test", dtype_backend="numpy")

View File

@@ -0,0 +1,102 @@
import numpy as np
import pytest
from pandas._libs.parsers import (
_maybe_upcast,
na_values,
)
import pandas as pd
from pandas import NA
import pandas._testing as tm
from pandas.core.arrays import (
ArrowStringArray,
BooleanArray,
FloatingArray,
IntegerArray,
StringArray,
)
def test_maybe_upcast(any_real_numpy_dtype):
    """The NA sentinel of a real numpy dtype becomes a masked entry.

    GH#36712
    """
    dtype = np.dtype(any_real_numpy_dtype)
    sentinel = na_values[dtype]
    raw = np.array([1, 2, sentinel], dtype=dtype)

    result = _maybe_upcast(raw, use_dtype_backend=True)

    # Integer dtypes map to IntegerArray, everything else to FloatingArray.
    is_integer = issubclass(dtype.type, np.integer)
    array_cls = IntegerArray if is_integer else FloatingArray
    expected = array_cls(raw, mask=np.array([False, False, True]))

    tm.assert_extension_array_equal(result, expected)
def test_maybe_upcast_no_na(any_real_numpy_dtype):
    """Without NA sentinels the upcast result has an all-False mask.

    GH#36712
    """
    raw = np.array([1, 2, 3], dtype=any_real_numpy_dtype)

    result = _maybe_upcast(raw, use_dtype_backend=True)

    # Integer dtypes map to IntegerArray, everything else to FloatingArray.
    is_integer = issubclass(np.dtype(any_real_numpy_dtype).type, np.integer)
    array_cls = IntegerArray if is_integer else FloatingArray
    expected = array_cls(raw, mask=np.array([False, False, False]))

    tm.assert_extension_array_equal(result, expected)
def test_maybe_upcaste_bool():
    """A bool array holding the bool NA sentinel upcasts to a masked BooleanArray.

    GH#36712
    """
    sentinel = na_values[np.bool_]
    # Build through uint8 so the sentinel value survives, then view as bool.
    raw = np.array([True, False, sentinel], dtype="uint8").view(np.bool_)

    result = _maybe_upcast(raw, use_dtype_backend=True)

    expected = BooleanArray(raw, mask=np.array([False, False, True]))
    tm.assert_extension_array_equal(result, expected)
def test_maybe_upcaste_bool_no_nan():
    """A bool array without NA sentinels upcasts to an unmasked BooleanArray.

    GH#36712
    """
    raw = np.array([True, False, False], dtype="uint8").view(np.bool_)

    result = _maybe_upcast(raw, use_dtype_backend=True)

    expected = BooleanArray(raw, mask=np.array([False, False, False]))
    tm.assert_extension_array_equal(result, expected)
def test_maybe_upcaste_all_nan():
    """An int64 array made only of NA sentinels upcasts to a fully masked array.

    GH#36712
    """
    sentinel = na_values[np.int64]
    raw = np.array([sentinel, sentinel], dtype=np.int64)

    result = _maybe_upcast(raw, use_dtype_backend=True)

    expected = IntegerArray(raw, mask=np.array([True, True]))
    tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
    """Object arrays upcast to the string array type for ``string_storage``.

    GH#36712. With "python" storage a missing value is expected as ``NA`` in
    a ``StringArray``; otherwise as ``None`` in an ``ArrowStringArray``.
    """
    pa = pytest.importorskip("pyarrow")
    with pd.option_context("mode.string_storage", string_storage):
        raw = np.array(["a", "b", val], dtype=np.object_)
        result = _maybe_upcast(raw, use_dtype_backend=True)

        is_real_string = val == "c"
        if string_storage == "python":
            last = "c" if is_real_string else NA
            expected = StringArray(np.array(["a", "b", last], dtype=np.object_))
        else:
            last = "c" if is_real_string else None
            expected = ArrowStringArray(pa.array(["a", "b", last]))
        tm.assert_extension_array_equal(result, expected)

View File

@@ -0,0 +1,150 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import pytest
from pandas import (
DataFrame,
Index,
Timestamp,
)
import pandas._testing as tm
# TODO(1.4): Change these to xfails whenever parse_dates support (which was
# intentionally disabled to keep PR sizes small) is added back
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
def test_usecols_with_parse_dates(all_parsers, usecols):
    """Positional ``usecols`` combined with a merged date column (gh-9755)."""
    csv_text = (
        "a,b,c,d,e\n"
        "0,1,2014-01-01,09:00,4\n"
        "0,1,2014-01-02,10:00,4"
    )
    expected = DataFrame(
        {
            "a": [0, 0],
            "c_d": [
                Timestamp("2014-01-01 09:00:00"),
                Timestamp("2014-01-02 10:00:00"),
            ],
        },
        columns=["c_d", "a"],
    )

    result = all_parsers.read_csv(
        StringIO(csv_text), usecols=usecols, parse_dates=[[1, 2]]
    )
    tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates2(all_parsers):
    """``usecols`` by name with a parsed date column used as index (gh-13604)."""
    csv_text = (
        "2008-02-07 09:40,1032.43\n"
        "2008-02-07 09:50,1042.54\n"
        "2008-02-07 10:00,1051.65"
    )
    names = ["date", "values"]

    timestamps = [
        Timestamp("2008-02-07 09:40"),
        Timestamp("2008-02-07 09:50"),
        Timestamp("2008-02-07 10:00"),
    ]
    expected = DataFrame(
        {"values": [1032.43, 1042.54, 1051.65]},
        index=Index(timestamps, name="date"),
    )

    result = all_parsers.read_csv(
        StringIO(csv_text),
        parse_dates=[0],
        index_col=0,
        usecols=list(names),
        header=None,
        names=names,
    )
    tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates3(all_parsers):
    """All ten columns selected by name, first one parsed as a date (gh-14792)."""
    csv_text = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    usecols = list("abcdefghij")

    # Column "a" holds the timestamp; "b".."j" hold one integer each.
    cols = {"a": Timestamp("2016-09-21").as_unit("ns")}
    for name, value in zip("bcdefghij", [1, 1, 2, 3, 4, 5, 6, 7, 8]):
        cols[name] = [value]
    expected = DataFrame(cols, columns=usecols)

    result = all_parsers.read_csv(
        StringIO(csv_text), usecols=usecols, parse_dates=[0]
    )
    tm.assert_frame_equal(result, expected)
def test_usecols_with_parse_dates4(all_parsers):
    """Merging columns 0 and 1 via ``parse_dates`` yields a string ``a_b`` column."""
    csv_text = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"
    remaining = list("cdefghij")

    # "a_b" is the joined text; "c".."j" hold 1..8 respectively.
    cols = {"a_b": "2016/09/21 1"}
    for name, value in zip(remaining, range(1, 9)):
        cols[name] = [value]
    expected = DataFrame(cols, columns=["a_b"] + remaining)

    result = all_parsers.read_csv(
        StringIO(csv_text),
        usecols=list("abcdefghij"),
        parse_dates=[[0, 1]],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]])
@pytest.mark.parametrize(
    "names",
    [
        list("abcde"),  # Names span all columns in original data.
        list("acd"),  # Names span only the selected columns.
    ],
)
def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names):
    """Header-less data with ``names``, ``usecols`` and a merged date (gh-9755)."""
    csv_text = "0,1,2014-01-01,09:00,4\n0,1,2014-01-02,10:00,4"
    expected = DataFrame(
        {
            "a": [0, 0],
            "c_d": [
                Timestamp("2014-01-01 09:00:00"),
                Timestamp("2014-01-02 10:00:00"),
            ],
        },
        columns=["c_d", "a"],
    )

    result = all_parsers.read_csv(
        StringIO(csv_text), names=names, parse_dates=[[1, 2]], usecols=usecols
    )
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,92 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import pytest
from pandas import DataFrame
import pandas._testing as tm
def test_usecols_with_unicode_strings(all_parsers):
    """Selecting two of four columns by multi-character name (gh-13219)."""
    csv_text = (
        "AAA,BBB,CCC,DDD\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )
    expected = DataFrame(
        {
            "AAA": {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002,
            },
            "BBB": {0: 8, 1: 2, 2: 7},
        }
    )

    result = all_parsers.read_csv(StringIO(csv_text), usecols=["AAA", "BBB"])
    tm.assert_frame_equal(result, expected)
def test_usecols_with_single_byte_unicode_strings(all_parsers):
    """Selecting two of four columns by single-character name (gh-13219)."""
    csv_text = (
        "A,B,C,D\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )
    expected = DataFrame(
        {
            "A": {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002,
            },
            "B": {0: 8, 1: 2, 2: 7},
        }
    )

    result = all_parsers.read_csv(StringIO(csv_text), usecols=["A", "B"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("usecols", [["AAA", b"BBB"], [b"AAA", "BBB"]])
def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
    """Mixing ``str`` and ``bytes`` entries in ``usecols`` raises ``ValueError``."""
    csv_text = (
        "AAA,BBB,CCC,DDD\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )
    expected_msg = (
        "'usecols' must either be list-like "
        "of all strings, all unicode, all "
        "integers or a callable."
    )

    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(StringIO(csv_text), usecols=usecols)
# The parametrize list previously held the same value twice, so each parser
# ran an identical case two times; one entry is enough.
@pytest.mark.parametrize("usecols", [["あああ", "いい"]])
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
    """Columns with multi-byte (Japanese) names can be selected via ``usecols``.

    Parameters
    ----------
    all_parsers : parser fixture providing ``read_csv``.
    usecols : list of str
        Names of the two columns to keep.
    """
    data = """あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a"""
    parser = all_parsers

    exp_data = {
        "あああ": {
            0: 0.056674972999999997,
            1: 2.6132309819999997,
            2: 3.5689350380000002,
        },
        "いい": {0: 8, 1: 2, 2: 7},
    }
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,471 @@
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import numpy as np
import pytest
from pandas.errors import ParserError
from pandas import (
DataFrame,
Index,
array,
)
import pandas._testing as tm
# Error text expected when ``usecols`` is invalid (used as a ``match``
# pattern with ``pytest.raises`` in the tests below).
_msg_validate_usecols_arg = (
    "'usecols' must either be list-like "
    "of all strings, all unicode, all "
    "integers or a callable."
)
# Error text template for ``usecols`` names missing from the columns; ``{0}``
# is filled with a regex for the list of missing names.
_msg_validate_usecols_names = (
    "Usecols do not match columns, columns expected but not found: {0}"
)
# TODO: Switch to xfails
# Marker applied to individual tests that request the ``pyarrow_skip`` fixture.
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_raise_on_mixed_dtype_usecols(all_parsers):
    """``usecols`` mixing integers and strings raises ``ValueError`` (gh-12678)."""
    csv_text = "a,b,c\n1000,2000,3000\n4000,5000,6000\n"
    mixed_usecols = [0, "b", 2]

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        all_parsers.read_csv(StringIO(csv_text), usecols=mixed_usecols)
@skip_pyarrow
@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")])
def test_usecols(all_parsers, usecols):
    """Columns "b" and "c" can be selected by position or by name."""
    csv_text = "a,b,c\n1,2,3\n4,5,6\n7,8,9\n10,11,12"
    rows = [[2, 3], [5, 6], [8, 9], [11, 12]]

    result = all_parsers.read_csv(StringIO(csv_text), usecols=usecols)

    tm.assert_frame_equal(result, DataFrame(rows, columns=["b", "c"]))
@skip_pyarrow
def test_usecols_with_names(all_parsers):
    """``names`` relabel the columns picked by positional ``usecols``."""
    csv_text = "a,b,c\n1,2,3\n4,5,6\n7,8,9\n10,11,12"
    new_names = ["foo", "bar"]
    rows = [[2, 3], [5, 6], [8, 9], [11, 12]]

    result = all_parsers.read_csv(
        StringIO(csv_text), names=new_names, usecols=[1, 2], header=0
    )

    tm.assert_frame_equal(result, DataFrame(rows, columns=new_names))
@skip_pyarrow
@pytest.mark.parametrize(
    "names,usecols", [(["b", "c"], [1, 2]), (["a", "b", "c"], ["b", "c"])]
)
def test_usecols_relative_to_names(all_parsers, names, usecols):
    """Header-less data: ``usecols`` and ``names`` together yield "b" and "c"."""
    csv_text = "1,2,3\n4,5,6\n7,8,9\n10,11,12"
    rows = [[2, 3], [5, 6], [8, 9], [11, 12]]

    result = all_parsers.read_csv(
        StringIO(csv_text), names=names, header=None, usecols=usecols
    )

    tm.assert_frame_equal(result, DataFrame(rows, columns=["b", "c"]))
@skip_pyarrow
def test_usecols_relative_to_names2(all_parsers):
    """Two names for the first two of three header-less columns (gh-5766)."""
    csv_text = "1,2,3\n4,5,6\n7,8,9\n10,11,12"
    rows = [[1, 2], [4, 5], [7, 8], [10, 11]]

    result = all_parsers.read_csv(
        StringIO(csv_text), names=["a", "b"], header=None, usecols=[0, 1]
    )

    tm.assert_frame_equal(result, DataFrame(rows, columns=["a", "b"]))
@skip_pyarrow
def test_usecols_name_length_conflict(all_parsers):
    """Two ``names`` with a single ``usecols`` entry raises ``ValueError``."""
    csv_text = "1,2,3\n4,5,6\n7,8,9\n10,11,12"
    expected_msg = (
        "Number of passed names did not match number of header fields in the file"
    )

    with pytest.raises(ValueError, match=expected_msg):
        all_parsers.read_csv(
            StringIO(csv_text), names=["a", "b"], header=None, usecols=[1]
        )
def test_usecols_single_string(all_parsers):
    """A bare string passed as ``usecols`` raises ``ValueError`` (gh-20558)."""
    csv_text = "foo, bar, baz\n1000, 2000, 3000\n4000, 5000, 6000"

    with pytest.raises(ValueError, match=_msg_validate_usecols_arg):
        all_parsers.read_csv(StringIO(csv_text), usecols="foo")
@skip_pyarrow
@pytest.mark.parametrize(
    "data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]
)
def test_usecols_index_col_false(all_parsers, data):
    """``index_col=False`` with named ``usecols``, with or without trailing commas.

    see gh-9082
    """
    expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]})

    result = all_parsers.read_csv(
        StringIO(data), usecols=["a", "c", "d"], index_col=False
    )

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("index_col", ["b", 0])
@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]])
def test_usecols_index_col_conflict(all_parsers, usecols, index_col):
    """An integer ``index_col`` is interpreted relative to ``usecols`` (gh-4201)."""
    csv_text = "a,b,c,d\nA,a,1,one\nB,b,2,two"
    expected_index = Index(["a", "b"], name="b")
    expected = DataFrame({"c": [1, 2]}, index=expected_index)

    result = all_parsers.read_csv(
        StringIO(csv_text), usecols=usecols, index_col=index_col
    )

    tm.assert_frame_equal(result, expected)
def test_usecols_index_col_conflict2(all_parsers):
    """A two-level ``index_col`` drawn from the ``usecols`` selection (gh-4201)."""
    csv_text = "a,b,c,d\nA,a,1,one\nB,b,2,two"
    expected = DataFrame(
        {"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}
    ).set_index(["b", "c"])

    result = all_parsers.read_csv(
        StringIO(csv_text), usecols=["b", "c", "d"], index_col=["b", "c"]
    )

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_usecols_implicit_index_col(all_parsers):
    """Rows with one more field than the header get an implicit index (gh-2654)."""
    csv_text = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"
    expected = DataFrame(
        {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]
    )

    result = all_parsers.read_csv(StringIO(csv_text), usecols=["a", "b"])

    tm.assert_frame_equal(result, expected)
def test_usecols_index_col_middle(all_parsers):
    """``index_col`` naming the middle column of the selection (GH#9098)."""
    csv_text = "a,b,c,d\n1,2,3,4\n"
    expected = DataFrame({"b": [2], "d": [4]}, index=Index([3], name="c"))

    result = all_parsers.read_csv(
        StringIO(csv_text), usecols=["b", "c", "d"], index_col="c"
    )

    tm.assert_frame_equal(result, expected)
def test_usecols_index_col_end(all_parsers):
    """``index_col`` naming the last column of the selection (GH#9098)."""
    csv_text = "a,b,c,d\n1,2,3,4\n"
    expected = DataFrame({"b": [2], "c": [3]}, index=Index([4], name="d"))

    result = all_parsers.read_csv(
        StringIO(csv_text), usecols=["b", "c", "d"], index_col="d"
    )

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_usecols_regex_sep(all_parsers):
    """``usecols`` together with the regex separator ``\\s+`` (gh-2733)."""
    csv_text = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    expected = DataFrame(
        {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]
    )

    result = all_parsers.read_csv(
        StringIO(csv_text), sep=r"\s+", usecols=("a", "b")
    )

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_usecols_with_whitespace(all_parsers):
    """``usecols`` together with ``delim_whitespace=True``."""
    csv_text = "a b c\n4 apple bat 5.7\n8 orange cow 10"
    expected = DataFrame(
        {"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]
    )

    result = all_parsers.read_csv(
        StringIO(csv_text), delim_whitespace=True, usecols=("a", "b")
    )

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "usecols,expected",
    [
        # Integers select by position.
        ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
        # Digit strings select by header name.
        (
            ["0", "1"],
            DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),
        ),
    ],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
    """Header names that look like integers: position vs. name selection."""
    csv_text = "2,0,1\n1000,2000,3000\n4000,5000,6000"

    result = all_parsers.read_csv(StringIO(csv_text), usecols=usecols)

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
def test_empty_usecols(all_parsers):
    """An empty ``usecols`` set produces a frame with no columns."""
    csv_text = "a,b,c\n1,2,3\n4,5,6"
    empty_frame = DataFrame(columns=Index([]))

    result = all_parsers.read_csv(StringIO(csv_text), usecols=set())

    tm.assert_frame_equal(result, empty_frame)
def test_np_array_usecols(all_parsers):
    """A numpy array of names is accepted as ``usecols`` (gh-12546)."""
    csv_text = "a,b,c\n1,2,3"
    wanted = np.array(["a", "b"])
    expected = DataFrame([[1, 2]], columns=wanted)

    result = all_parsers.read_csv(StringIO(csv_text), usecols=wanted)

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "usecols,expected",
    [
        # Case-insensitive name filter keeps three of the four columns.
        (
            lambda x: x.upper() in ["AAA", "BBB", "DDD"],
            DataFrame(
                {
                    "AaA": {
                        0: 0.056674972999999997,
                        1: 2.6132309819999997,
                        2: 3.5689350380000002,
                    },
                    "bBb": {0: 8, 1: 2, 2: 7},
                    "ddd": {0: "a", 1: "b", 2: "a"},
                }
            ),
        ),
        # A filter rejecting every name yields a frame with no columns.
        (lambda x: False, DataFrame(columns=Index([]))),
    ],
)
def test_callable_usecols(all_parsers, usecols, expected):
    """A callable ``usecols`` is applied to each column name (gh-14154)."""
    csv_text = (
        "AaA,bBb,CCC,ddd\n"
        "0.056674973,8,True,a\n"
        "2.613230982,2,False,b\n"
        "3.568935038,7,False,a"
    )

    result = all_parsers.read_csv(StringIO(csv_text), usecols=usecols)

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]])
def test_incomplete_first_row(all_parsers, usecols):
    """A short first row leaves the selected missing column as NaN (gh-6710)."""
    csv_text = "1,2\n1,2,3"
    expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]})

    result = all_parsers.read_csv(
        StringIO(csv_text), names=["a", "b", "c"], usecols=usecols
    )

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "data,usecols,kwargs,expected",
    [
        # see gh-8985
        (
            "19,29,39\n" * 2 + "10,20,30,40",
            [0, 1, 2],
            {"header": None},
            DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]]),
        ),
        # see gh-9549
        (
            ("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n1,2,3,,,1,\n1,2,3\n5,6,7"),
            ["A", "B", "C"],
            {},
            DataFrame(
                {
                    "A": [1, 3, 1, 1, 1, 5],
                    "B": [2, 4, 2, 2, 2, 6],
                    "C": [3, 5, 4, 3, 3, 7],
                }
            ),
        ),
    ],
)
def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected):
    """Rows longer than the selection are read using only ``usecols`` (gh-8985)."""
    buffer = StringIO(data)

    result = all_parsers.read_csv(buffer, usecols=usecols, **kwargs)

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize(
    "usecols,kwargs,expected,msg",
    [
        (
            ["a", "b", "c", "d"],
            {},
            DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}),
            None,
        ),
        (
            ["a", "b", "c", "f"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (["a", "b", "f"], {}, None, _msg_validate_usecols_names.format(r"\['f'\]")),
        (
            ["a", "b", "f", "g"],
            {},
            None,
            _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]"),
        ),
        # see gh-14671
        (
            None,
            {"header": 0, "names": ["A", "B", "C", "D"]},
            DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], "D": [4, 8]}),
            None,
        ),
        (
            ["A", "B", "C", "f"],
            {"header": 0, "names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
        (
            ["A", "B", "f"],
            {"names": ["A", "B", "C", "D"]},
            None,
            _msg_validate_usecols_names.format(r"\['f'\]"),
        ),
    ],
)
def test_raises_on_usecols_names_mismatch(all_parsers, usecols, kwargs, expected, msg):
    """``usecols`` names absent from the columns raise; matching ones parse.

    Parameters
    ----------
    all_parsers : parser fixture providing ``read_csv``.
    usecols : list of str or None
        Column names to select.
    kwargs : dict
        Extra keyword arguments for ``read_csv``.
    expected : DataFrame or None
        Expected frame, or ``None`` when a ``ValueError`` is expected.
    msg : str or None
        Regex the ``ValueError`` message must match when ``expected`` is None.
    """
    data = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    # Build a new dict rather than calling ``kwargs.update``: the dict objects
    # come straight from the parametrize list above, so mutating them in place
    # would leak state into every later invocation that receives the same dict.
    read_kwargs = {**kwargs, "usecols": usecols}
    parser = all_parsers

    if expected is None:
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), **read_kwargs)
    else:
        result = parser.read_csv(StringIO(data), **read_kwargs)
        tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]])
def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols):
    """``usecols`` refers to the replacement ``names``, not the file header."""
    csv_text = "a,b,c,d\n1,2,3,4\n5,6,7,8"
    expected = DataFrame({"A": [1, 5], "C": [3, 7]})

    result = all_parsers.read_csv(
        StringIO(csv_text), header=0, names=["A", "B", "C", "D"], usecols=usecols
    )

    tm.assert_frame_equal(result, expected)
@skip_pyarrow
@pytest.mark.parametrize("names", [None, ["a", "b"]])
def test_usecols_indices_out_of_bounds(all_parsers, names):
    """A positional ``usecols`` index past the last column raises ``ParserError``.

    GH#25623 & GH 41130; enforced in 2.0
    """
    csv_text = "\na,b\n1,2\n"

    with pytest.raises(ParserError, match="Defining usecols without of bounds"):
        all_parsers.read_csv(
            StringIO(csv_text), usecols=[0, 2], names=names, header=0
        )
@skip_pyarrow
def test_usecols_additional_columns(all_parsers):
    """A callable ``usecols`` with a data row wider than the header (GH#46997)."""

    def keep_column(header):
        return header.strip() in ["a", "b", "c"]

    result = all_parsers.read_csv(
        StringIO("a,b\nx,y,z"), index_col=False, usecols=keep_column
    )

    tm.assert_frame_equal(result, DataFrame({"a": ["x"], "b": "y"}))
@skip_pyarrow
def test_usecols_additional_columns_integer_columns(all_parsers):
    """Same as the wider-row case, but with digit header names (GH#46997)."""

    def keep_column(header):
        return header.strip() in ["0", "1"]

    result = all_parsers.read_csv(
        StringIO("0,1\nx,y,z"), index_col=False, usecols=keep_column
    )

    tm.assert_frame_equal(result, DataFrame({"0": ["x"], "1": "y"}))
def test_usecols_dtype(all_parsers):
    """``dtype`` may name a column that ``usecols`` leaves out."""
    csv_text = "\ncol1,col2,col3\na,1,x\nb,2,y\n"
    requested_dtypes = {"col1": "string", "col2": "uint8", "col3": "string"}
    expected = DataFrame(
        {"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
    )

    result = all_parsers.read_csv(
        StringIO(csv_text),
        usecols=["col1", "col2"],
        dtype=requested_dtypes,
    )

    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,50 @@
from collections.abc import Generator
from contextlib import contextmanager
import pathlib
import tempfile
import pytest
from pandas.io.pytables import HDFStore
# Import PyTables through pytest so that a missing install results in a skip
# rather than an ImportError.
tables = pytest.importorskip("tables")
# set these parameters so we don't have file sharing
tables.parameters.MAX_NUMEXPR_THREADS = 1
tables.parameters.MAX_BLOSC_THREADS = 1
tables.parameters.MAX_THREADS = 1
def safe_close(store):
    """Close ``store`` if it is not ``None``, ignoring any ``OSError``."""
    if store is None:
        return
    try:
        store.close()
    except OSError:
        # Best-effort close: an OSError here is deliberately ignored.
        pass
# Context manager yielding an HDFStore that lives in a temporary directory.
@contextmanager
def ensure_clean_store(
    path, mode="a", complevel=None, complib=None, fletcher32=False
) -> Generator[HDFStore, None, None]:
    """Yield an open ``HDFStore`` located at ``path`` inside a temp directory.

    The store is opened with the given ``mode``, ``complevel``, ``complib``
    and ``fletcher32``. Both the store and the temporary directory are
    managed by ``with`` blocks, so they are released when the caller's
    block exits.
    """
    store_options = {
        "mode": mode,
        "complevel": complevel,
        "complib": complib,
        "fletcher32": fletcher32,
    }
    with tempfile.TemporaryDirectory() as tmpdirname:
        store_file = pathlib.Path(tmpdirname, path)
        with HDFStore(store_file, **store_options) as store:
            yield store
def _maybe_remove(store, key):
"""
For tests using tables, try removing the table to be sure there is
no content from previous tests using the same table name.
"""
try:
store.remove(key)
except (ValueError, KeyError):
pass

View File

@@ -0,0 +1,9 @@
import uuid
import pytest
@pytest.fixture
def setup_path():
    """Return a unique file name of the form ``tmp.__<uuid4>__.h5``."""
    unique_id = uuid.uuid4()
    return f"tmp.__{unique_id}__.h5"

View File

@@ -0,0 +1,924 @@
import datetime
from datetime import timedelta
import re
import numpy as np
import pytest
from pandas._libs.tslibs import Timestamp
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Series,
_testing as tm,
concat,
date_range,
read_hdf,
)
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
)
# Apply the ``single_cpu`` marker to every test in this module.
pytestmark = pytest.mark.single_cpu
# Import PyTables through pytest so that a missing install results in a skip.
tables = pytest.importorskip("tables")
@pytest.mark.filterwarnings("ignore::tables.NaturalNameWarning")
def test_append(setup_path):
    """Frames appended in two pieces read back equal to the whole frame.

    Covers plain keys, a leading-slash key, a key containing a space, a
    mixed object column with a NaN, and unsigned integer columns with and
    without ``data_columns``.
    """
    with ensure_clean_store(setup_path) as store:
        df = tm.makeTimeDataFrame()

        # append, then append again
        _maybe_remove(store, "df1")
        store.append("df1", df[:10])
        store.append("df1", df[10:])
        tm.assert_frame_equal(store["df1"], df)

        # put as a table, then append
        _maybe_remove(store, "df2")
        store.put("df2", df[:10], format="table")
        store.append("df2", df[10:])
        tm.assert_frame_equal(store["df2"], df)

        # key written with a leading slash, read without it
        _maybe_remove(store, "df3")
        store.append("/df3", df[:10])
        store.append("/df3", df[10:])
        tm.assert_frame_equal(store["df3"], df)

        # A key with a space is allowed, but you almost always don't want to
        # do it (tables.NaturalNameWarning is filtered for this test).
        _maybe_remove(store, "/df3 foo")
        store.append("/df3 foo", df[:10])
        store.append("/df3 foo", df[10:])
        tm.assert_frame_equal(store["df3 foo"], df)

        # dtype issues - mixed type in a single object column
        df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
        df["mixed_column"] = "testing"
        df.loc[2, "mixed_column"] = np.nan
        _maybe_remove(store, "df")
        store.append("df", df)
        tm.assert_frame_equal(store["df"], df)

        # uints - test storage of uints
        def _random_uints(high, dtype):
            values = np.random.default_rng(2).integers(0, high=high, size=5)
            return Series(values, dtype=dtype)

        uint_data = DataFrame(
            {
                "u08": _random_uints(255, np.uint8),
                "u16": _random_uints(65535, np.uint16),
                "u32": _random_uints(2**30, np.uint32),
                "u64": Series(
                    [2**58, 2**59, 2**60, 2**61, 2**62],
                    dtype=np.uint64,
                ),
            },
            index=np.arange(5),
        )
        _maybe_remove(store, "uints")
        store.append("uints", uint_data)
        tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)

        # uints - test storage of uints in indexable columns
        _maybe_remove(store, "uints")
        # 64-bit indices not yet supported
        store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
        tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)
def test_append_series(setup_path):
    """Series round-trip through ``append``, including selection and MultiIndex."""
    with ensure_clean_store(setup_path) as store:
        # basic
        ss = tm.makeStringSeries()
        ts = tm.makeTimeSeries()
        ns = Series(np.arange(100))

        # unnamed series come back unnamed
        for key, series in (("ss", ss), ("ts", ts)):
            store.append(key, series)
            result = store[key]
            tm.assert_series_equal(result, series)
            assert result.name is None

        # a named series keeps its name
        ns.name = "foo"
        store.append("ns", ns)
        result = store["ns"]
        tm.assert_series_equal(result, ns)
        assert result.name == ns.name

        # select on the values
        result = store.select("ns", "foo>60")
        tm.assert_series_equal(result, ns[ns > 60])

        # select on the index and values
        result = store.select("ns", "foo>70 and index<90")
        tm.assert_series_equal(
            result, ns[(ns > 70) & (ns.index < 90)], check_index_type=True
        )

        # multi-index
        mi = DataFrame(np.random.default_rng(2).standard_normal((5, 1)), columns=["A"])
        mi["B"] = np.arange(len(mi))
        mi["C"] = "foo"
        mi.loc[3:5, "C"] = "bar"
        mi.set_index(["C", "B"], inplace=True)
        stacked = mi.stack(future_stack=True)
        stacked.index = stacked.index.droplevel(2)
        store.append("mi", stacked)
        tm.assert_series_equal(store["mi"], stacked, check_index_type=True)
def test_append_some_nans(setup_path):
    """Frames with partly or fully NaN columns round-trip through ``append``."""
    with ensure_clean_store(setup_path) as store:
        df = DataFrame(
            {
                "A": Series(np.random.default_rng(2).standard_normal(20)).astype(
                    "int32"
                ),
                "A1": np.random.default_rng(2).standard_normal(20),
                "A2": np.random.default_rng(2).standard_normal(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("2001-01-01").as_unit("ns"),
                "E": Timestamp("2001-01-02").as_unit("ns"),
            },
            index=np.arange(20),
        )

        # some nans
        _maybe_remove(store, "df1")
        df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
        store.append("df1", df[:10])
        store.append("df1", df[10:])
        tm.assert_frame_equal(store["df1"], df, check_index_type=True)

        # One column blanked entirely per round: first float column, second
        # float column, then a datetime column.
        for key, column in (("df1", "A1"), ("df2", "A2"), ("df3", "E")):
            blanked = df.copy()
            blanked[column] = np.nan
            _maybe_remove(store, key)
            store.append(key, blanked[:10])
            store.append(key, blanked[10:])
            tm.assert_frame_equal(store[key], blanked, check_index_type=True)
def test_append_all_nans(setup_path):
    """Rows that are entirely NaN are dropped only when ``dropna`` applies.

    For an all-float frame, ``dropna=True`` (or the ``io.hdf.dropna_table``
    option set to True) is expected to leave only the last four rows. Frames
    that also have string or datetime columns are expected back unchanged
    for both ``dropna`` settings.
    """
    with ensure_clean_store(setup_path) as store:

        def _append_in_two_parts(key, frame, **append_kwargs):
            # Clear ``key``, append both parts of ``frame``, read it back.
            _maybe_remove(store, key)
            store.append(key, frame[:10], **append_kwargs)
            store.append(key, frame[10:], **append_kwargs)
            return store[key]

        df = DataFrame(
            {
                "A1": np.random.default_rng(2).standard_normal(20),
                "A2": np.random.default_rng(2).standard_normal(20),
            },
            index=np.arange(20),
        )
        df.loc[0:15, :] = np.nan

        # nan some entire rows (dropna=True)
        stored = _append_in_two_parts("df", df, dropna=True)
        tm.assert_frame_equal(stored, df[-4:], check_index_type=True)

        # nan some entire rows (dropna=False)
        stored = _append_in_two_parts("df2", df, dropna=False)
        tm.assert_frame_equal(stored, df, check_index_type=True)

        # tests the option io.hdf.dropna_table
        with pd.option_context("io.hdf.dropna_table", False):
            tm.assert_frame_equal(_append_in_two_parts("df3", df), df)

        with pd.option_context("io.hdf.dropna_table", True):
            tm.assert_frame_equal(_append_in_two_parts("df4", df), df[-4:])

        # nan some entire rows (string are still written!)
        df = DataFrame(
            {
                "A1": np.random.default_rng(2).standard_normal(20),
                "A2": np.random.default_rng(2).standard_normal(20),
                "B": "foo",
                "C": "bar",
            },
            index=np.arange(20),
        )
        df.loc[0:15, :] = np.nan

        stored = _append_in_two_parts("df", df, dropna=True)
        tm.assert_frame_equal(stored, df, check_index_type=True)

        stored = _append_in_two_parts("df2", df, dropna=False)
        tm.assert_frame_equal(stored, df, check_index_type=True)

        # nan some entire rows (but since we have dates they are still
        # written!)
        df = DataFrame(
            {
                "A1": np.random.default_rng(2).standard_normal(20),
                "A2": np.random.default_rng(2).standard_normal(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("2001-01-01").as_unit("ns"),
                "E": Timestamp("2001-01-02").as_unit("ns"),
            },
            index=np.arange(20),
        )
        df.loc[0:15, :] = np.nan

        stored = _append_in_two_parts("df", df, dropna=True)
        tm.assert_frame_equal(stored, df, check_index_type=True)

        stored = _append_in_two_parts("df2", df, dropna=False)
        tm.assert_frame_equal(stored, df, check_index_type=True)
def test_append_frame_column_oriented(setup_path):
    """Appending with ``axes=["columns"]`` and selecting by column/index."""
    with ensure_clean_store(setup_path) as store:
        # column oriented
        # NOTE: the local must stay named ``df``; a query string below
        # refers to ``df.index`` by name.
        df = tm.makeTimeDataFrame()
        df.index = df.index._with_freq(None)  # freq doesn't round-trip

        _maybe_remove(store, "df1")
        store.append("df1", df.iloc[:, :2], axes=["columns"])
        store.append("df1", df.iloc[:, 2:])
        tm.assert_frame_equal(store["df1"], df)

        # selection of a single column
        expected = df.reindex(columns=["A"])
        result = store.select("df1", "columns=A")
        tm.assert_frame_equal(expected, result)

        # selection on the non-indexable
        expected = df.reindex(columns=["A"], index=df.index[0:4])
        result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
        tm.assert_frame_equal(expected, result)

        # this isn't supported
        expected_msg = re.escape(
            "passing a filterable condition to a non-table indexer "
            "[Filter: Not Initialized]"
        )
        with pytest.raises(TypeError, match=expected_msg):
            store.select("df1", "columns=A and index>df.index[4]")
def test_append_with_different_block_ordering(setup_path):
    """Appends succeed across column orderings but fail when columns are added.

    GH 4096; using same frames, but different block orderings
    """
    with ensure_clean_store(setup_path) as store:
        for i in range(10):
            df = DataFrame(
                np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
            )
            df["index"] = range(10)
            df["index"] += i * 10
            df["int64"] = Series([1] * len(df), dtype="int64")
            df["int16"] = Series([1] * len(df), dtype="int16")

            # Re-create or move columns on some iterations so the column
            # order differs between appends.
            if i % 2 == 0:
                del df["int64"]
                df["int64"] = Series([1] * len(df), dtype="int64")
            if i % 3 == 0:
                df["A"] = df.pop("A")

            df.set_index("index", inplace=True)

            store.append("df", df)

    # test a different ordering but with more fields (like invalid
    # combinations)
    with ensure_clean_store(setup_path) as store:
        df = DataFrame(
            np.random.default_rng(2).standard_normal((10, 2)),
            columns=list("AB"),
            dtype="float64",
        )
        df["int64"] = Series([1] * len(df), dtype="int64")
        df["int16"] = Series([1] * len(df), dtype="int16")
        store.append("df", df)

        # store additional fields in different blocks
        df["int16_2"] = Series([1] * len(df), dtype="int16")
        int16_mismatch = re.escape(
            "cannot match existing table structure for [int16] on appending data"
        )
        with pytest.raises(ValueError, match=int16_mismatch):
            store.append("df", df)

        # store multiple additional fields in different blocks
        df["float_3"] = Series([1.0] * len(df), dtype="float64")
        float_mismatch = re.escape(
            "cannot match existing table structure for [A,B] on appending data"
        )
        with pytest.raises(ValueError, match=float_mismatch):
            store.append("df", df)
def test_append_with_strings(setup_path):
    """Check string column item sizes and ``min_itemsize`` handling on append."""

    def check_col(key, name, size):
        # Assert the itemsize of column ``name`` in the table stored at ``key``.
        # ``store`` is looked up when the helper is called, so it always refers
        # to the store opened by the enclosing ``with`` block.
        column = getattr(store.get_storer(key).table.description, name)
        assert column.itemsize == size

    with ensure_clean_store(setup_path) as store:
        # avoid truncation on elements
        frame = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
        store.append("df_big", frame)
        tm.assert_frame_equal(store.select("df_big"), frame)
        check_col("df_big", "values_block_1", 15)

        # appending smaller string ok
        shorter = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
        store.append("df_big", shorter)
        combined = concat([frame, shorter])
        tm.assert_frame_equal(store.select("df_big"), combined)
        check_col("df_big", "values_block_1", 15)

        # avoid truncation on elements
        frame = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
        store.append("df_big2", frame, min_itemsize={"values": 50})
        tm.assert_frame_equal(store.select("df_big2"), frame)
        check_col("df_big2", "values_block_1", 50)

        # bigger string on next append
        store.append("df_new", frame)
        longer = DataFrame([[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]])
        msg = (
            r"Trying to store a string with len \[26\] in "
            r"\[values_block_1\] column but\n"
            r"this column has a limit of \[15\]!\n"
            "Consider using min_itemsize to preset the sizes on these "
            "columns"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df_new", longer)

        # min_itemsize on Series index (GH 11412)
        frame = tm.makeMixedDataFrame().set_index("C")
        store.append("ss", frame["B"], min_itemsize={"index": 4})
        tm.assert_series_equal(store.select("ss"), frame["B"])

        # same as above, with data_columns=True
        store.append("ss2", frame["B"], data_columns=True, min_itemsize={"index": 4})
        tm.assert_series_equal(store.select("ss2"), frame["B"])

        # min_itemsize in index without appending (GH 10381)
        store.put("ss3", frame, format="table", min_itemsize={"index": 6})
        # just make sure there is a longer string:
        relabelled = frame.copy().reset_index().assign(C="longer").set_index("C")
        store.append("ss3", relabelled)
        tm.assert_frame_equal(store.select("ss3"), concat([frame, relabelled]))

        # same as above, with a Series
        store.put("ss4", frame["B"], format="table", min_itemsize={"index": 6})
        store.append("ss4", relabelled["B"])
        tm.assert_series_equal(
            store.select("ss4"), concat([frame["B"], relabelled["B"]])
        )

        # with nans
        _maybe_remove(store, "df")
        frame = tm.makeTimeDataFrame()
        frame["string"] = "foo"
        frame.loc[frame.index[1:4], "string"] = np.nan
        frame["string2"] = "bar"
        frame.loc[frame.index[4:8], "string2"] = np.nan
        frame["string3"] = "bah"
        frame.loc[frame.index[1:], "string3"] = np.nan
        store.append("df", frame)
        tm.assert_frame_equal(store.select("df"), frame)

    with ensure_clean_store(setup_path) as store:
        frame = DataFrame({"A": "foo", "B": "bar"}, index=range(10))

        # a min_itemsize that creates a data_column
        _maybe_remove(store, "df")
        store.append("df", frame, min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", frame, data_columns=["B"], min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["B", "A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", frame, data_columns=["B"], min_itemsize={"values": 200})
        check_col("df", "B", 200)
        check_col("df", "values_block_0", 200)
        assert store.get_storer("df").data_columns == ["B"]

        # infer the .typ on subsequent appends
        _maybe_remove(store, "df")
        store.append("df", frame[:5], min_itemsize=200)
        store.append("df", frame[5:], min_itemsize=200)
        tm.assert_frame_equal(store["df"], frame)

        # invalid min_itemsize keys
        frame = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
        _maybe_remove(store, "df")
        msg = re.escape(
            "min_itemsize has the key [foo] which is not an axis or data_column"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", frame, min_itemsize={"foo": 20, "foobar": 20})
def test_append_with_empty_string(setup_path):
    """Appending a chunk containing only an empty string round-trips (GH 12242)."""
    with ensure_clean_store(setup_path) as store:
        frame = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
        head, tail = frame[:-1], frame[-1:]

        # The final chunk holds nothing but the empty string.
        store.append("df", head, min_itemsize={"x": 1})
        store.append("df", tail, min_itemsize={"x": 1})

        stored = store.select("df")
        tm.assert_frame_equal(stored, frame)
def test_append_with_data_columns(setup_path):
    """Append with ``data_columns`` and query the resulting tables.

    NOTE: several ``where`` strings below reference the local ``df`` by name,
    so the ``store.select`` calls must stay directly in this function and the
    local names they use must not be renamed.
    """

    def check_col(key, name, size):
        # Assert the itemsize of column ``name`` in the table stored at ``key``.
        column = getattr(store.get_storer(key).table.description, name)
        assert column.itemsize == size

    with ensure_clean_store(setup_path) as store:
        df = tm.makeTimeDataFrame()
        b_loc = df.columns.get_loc("B")
        df.iloc[0, b_loc] = 1.0
        _maybe_remove(store, "df")
        store.append("df", df[:2], data_columns=["B"])
        store.append("df", df[2:])
        tm.assert_frame_equal(store["df"], df)

        # check that we have indices created
        table_cols = store._handle.root.df.table.cols
        assert table_cols.index.is_indexed is True
        assert table_cols.B.is_indexed is True

        # data column searching
        result = store.select("df", "B>0")
        expected = df[df.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column searching (with an indexable and a data_columns)
        result = store.select("df", "B>0 and index>df.index[3]")
        df_new = df.reindex(index=df.index[4:])
        expected = df_new[df_new.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column selection with a string data_column
        df_new = df.copy()
        df_new["string"] = "foo"
        df_new.loc[df_new.index[1:4], "string"] = np.nan
        df_new.loc[df_new.index[5:6], "string"] = "bar"
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"])
        result = store.select("df", "string='foo'")
        expected = df_new[df_new.string == "foo"]
        tm.assert_frame_equal(result, expected)

    # using min_itemsize and a data column
    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30})
        check_col("df", "string", 30)

        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize=30)
        check_col("df", "string", 30)

        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30})
        check_col("df", "string", 30)

    with ensure_clean_store(setup_path) as store:
        df_new["string2"] = "foobarbah"
        df_new["string_block1"] = "foobarbah1"
        df_new["string_block2"] = "foobarbah2"
        _maybe_remove(store, "df")
        store.append(
            "df",
            df_new,
            data_columns=["string", "string2"],
            min_itemsize={"string": 30, "string2": 40, "values": 50},
        )
        check_col("df", "string", 30)
        check_col("df", "string2", 40)
        check_col("df", "values_block_1", 50)

    with ensure_clean_store(setup_path) as store:
        # multiple data columns
        df_new = df.copy()
        df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
        df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0

        df_new["string"] = "foo"
        sl = df_new.columns.get_loc("string")
        df_new.iloc[1:4, sl] = np.nan
        df_new.iloc[5:6, sl] = "bar"

        df_new["string2"] = "foo"
        sl = df_new.columns.get_loc("string2")
        df_new.iloc[2:5, sl] = np.nan
        df_new.iloc[7:8, sl] = "bar"

        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["A", "B", "string", "string2"])
        result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0")
        mask = (
            (df_new.string == "foo")
            & (df_new.string2 == "foo")
            & (df_new.A > 0)
            & (df_new.B < 0)
        )
        expected = df_new[mask]
        # FIXME: 2020-05-07 freq check randomly fails in the CI
        tm.assert_frame_equal(result, expected, check_freq=False)

        # yield an empty frame
        result = store.select("df", "string='foo' and string2='cool'")
        expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
        tm.assert_frame_equal(result, expected)

    with ensure_clean_store(setup_path) as store:
        # doc example
        df_dc = df.copy()
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc["string2"] = "cool"
        df_dc["datetime"] = Timestamp("20010102").as_unit("ns")
        df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan

        _maybe_remove(store, "df_dc")
        store.append(
            "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
        )
        result = store.select("df_dc", "B>0")
        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        # FIXME: 2020-12-07 intermittent build failures here with freq of
        # None instead of BDay(4)
        tm.assert_frame_equal(result, expected, check_freq=False)

    with ensure_clean_store(setup_path) as store:
        # doc example part 2
        index = date_range("1/1/2000", periods=8)
        df_dc = DataFrame(
            np.random.default_rng(2).standard_normal((8, 3)),
            index=index,
            columns=["A", "B", "C"],
        )
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc[["B", "C"]] = df_dc[["B", "C"]].abs()
        df_dc["string2"] = "cool"

        # on-disk operations
        store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])

        result = store.select("df_dc", "B>0")
        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        tm.assert_frame_equal(result, expected)
def test_append_hierarchical(tmp_path, setup_path, multiindex_dataframe_random_data):
    """A MultiIndex frame round-trips via HDFStore and via ``to_hdf``/``read_hdf``."""
    frame = multiindex_dataframe_random_data
    frame.columns.name = None
    wanted = ["A", "B"]

    with ensure_clean_store(setup_path) as store:
        store.append("mi", frame)
        tm.assert_frame_equal(store.select("mi"), frame)

        # GH 3748
        subset = store.select("mi", columns=wanted)
        tm.assert_frame_equal(subset, frame.reindex(columns=wanted))

    # Same column selection, this time through the file-level API.
    hdf_path = tmp_path / "test.hdf"
    frame.to_hdf(hdf_path, key="df", format="table")
    subset = read_hdf(hdf_path, key="df", columns=wanted)
    tm.assert_frame_equal(subset, frame.reindex(columns=wanted))
def test_append_misc(setup_path):
    """``append`` accepts the ``chunksize`` and ``expectedrows`` keywords."""
    with ensure_clean_store(setup_path) as store:
        frame = tm.makeDataFrame()

        store.append("df", frame, chunksize=1)
        tm.assert_frame_equal(store.select("df"), frame)

        store.append("df1", frame, expectedrows=10)
        tm.assert_frame_equal(store.select("df1"), frame)
@pytest.mark.parametrize("chunksize", [10, 200, 1000])
def test_append_misc_chunksize(setup_path, chunksize):
    """A mixed-dtype frame round-trips when appended with various chunk sizes."""
    frame = tm.makeDataFrame()
    frame["string"] = "foo"
    frame["float322"] = 1.0
    frame["float322"] = frame["float322"].astype("float32")
    frame["bool"] = frame["float322"] > 0
    frame["time1"] = Timestamp("20130101").as_unit("ns")
    frame["time2"] = Timestamp("20130102").as_unit("ns")

    with ensure_clean_store(setup_path, mode="w") as store:
        store.append("obj", frame, chunksize=chunksize)
        stored = store.select("obj")
        tm.assert_frame_equal(stored, frame)
def test_append_misc_empty_frame(setup_path):
    """Appending or putting zero-length frames (GH4273)."""
    columns = list("ABC")

    with ensure_clean_store(setup_path) as store:
        # 0 len: after appending only an empty frame, the key cannot be selected
        empty = DataFrame(columns=columns)
        store.append("df", empty)
        with pytest.raises(KeyError, match="'No object named df in the file'"):
            store.select("df")

        # repeated append of 0/non-zero frames
        filled = DataFrame(np.random.default_rng(2).random((10, 3)), columns=columns)
        store.append("df", filled)
        tm.assert_frame_equal(store.select("df"), filled)
        store.append("df", empty)
        tm.assert_frame_equal(store.select("df"), filled)

        # store
        fresh_empty = DataFrame(columns=columns)
        store.put("df2", fresh_empty)
        tm.assert_frame_equal(store.select("df2"), fresh_empty)
# TODO(ArrayManager) currently we rely on falling back to BlockManager, but
# the conversion from AM->BM converts the invalid object dtype column into
# a datetime64 column no longer raising an error
@td.skip_array_manager_not_yet_implemented
def test_append_raise(setup_path):
    """Appending invalid input raises with a descriptive error message."""
    with ensure_clean_store(setup_path) as store:
        # list in column
        frame = tm.makeDataFrame()
        frame["invalid"] = [["a"]] * len(frame)
        assert frame.dtypes["invalid"] == np.object_
        # The expected message spans two lines; the break is spelled out here.
        msg = re.escape(
            "Cannot serialize the column [invalid]\n"
            "because its data contents are not [string] but [mixed] object dtype"
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", frame)

        # multiple invalid columns
        frame["invalid2"] = [["a"]] * len(frame)
        frame["invalid3"] = [["a"]] * len(frame)
        with pytest.raises(TypeError, match=msg):
            store.append("df", frame)

        # datetime with embedded nans as object
        frame = tm.makeDataFrame()
        stamps = Series(datetime.datetime(2001, 1, 2), index=frame.index)
        stamps = stamps.astype(object)
        stamps[0:5] = np.nan
        frame["invalid"] = stamps
        assert frame.dtypes["invalid"] == np.object_
        msg = "too many timezones in this block, create separate data columns"
        with pytest.raises(TypeError, match=msg):
            store.append("df", frame)

        # directly ndarray
        msg = "value must be None, Series, or DataFrame"
        with pytest.raises(TypeError, match=msg):
            store.append("df", np.arange(10))

        # series directly
        msg = re.escape(
            "cannot properly create the storer for: "
            "[group->df,value-><class 'pandas.core.series.Series'>]"
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", Series(np.arange(10)))

        # appending an incompatible table
        frame = tm.makeDataFrame()
        store.append("df", frame)

        frame["foo"] = "foo"
        msg = re.escape(
            "invalid combination of [non_index_axes] on appending data "
            "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
            "[(1, ['A', 'B', 'C', 'D'])]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", frame)

        # incompatible type (GH 41897)
        _maybe_remove(store, "df")
        frame["foo"] = Timestamp("20130101")
        store.append("df", frame)

        frame["foo"] = "bar"
        msg = re.escape(
            "invalid combination of [values_axes] on appending data "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->bytes24,kind->string,shape->(1, 30)] "
            "vs current table "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->datetime64,kind->datetime64,shape->None]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", frame)
def test_append_with_timedelta(setup_path):
    """Store and query a timedelta column (GH 3577).

    NOTE: one ``where`` string references the module-level ``pd`` name, so the
    ``store.select`` calls stay directly in this function.
    """
    start = Timestamp("20130101").as_unit("ns")
    later = [start + timedelta(days=i, seconds=10) for i in range(10)]
    df = DataFrame({"A": start, "B": later})
    df["C"] = df["A"] - df["B"]
    df.loc[3:5, "C"] = np.nan

    with ensure_clean_store(setup_path) as store:
        # table
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=True)
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        result = store.select("df", where="C<100000")
        tm.assert_frame_equal(result, df)

        result = store.select("df", where="C<pd.Timedelta('-3D')")
        tm.assert_frame_equal(result, df.iloc[3:])

        result = store.select("df", "C<'-3D'")
        tm.assert_frame_equal(result, df.iloc[3:])

        # a bit hacky here as we don't really deal with the NaT properly
        result = store.select("df", "C<'-500000s'")
        result = result.dropna(subset=["C"])
        tm.assert_frame_equal(result, df.iloc[6:])

        result = store.select("df", "C<'-3.5D'")
        result = result.iloc[1:]
        tm.assert_frame_equal(result, df.iloc[4:])

        # fixed
        _maybe_remove(store, "df2")
        store.put("df2", df)
        result = store.select("df2")
        tm.assert_frame_equal(result, df)
def test_append_to_multiple(setup_path):
    """``append_to_multiple`` validates its arguments and splits a frame across tables."""
    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    right["foo"] = "bar"
    df = concat([left, right], axis=1)

    with ensure_clean_store(setup_path) as store:
        # exceptions: the selector must be one of the keys of the mapping
        selector_msg = "append_to_multiple requires a selector that is in passed dict"
        with pytest.raises(ValueError, match=selector_msg):
            store.append_to_multiple(
                {"df1": ["A", "B"], "df2": None}, df, selector="df3"
            )

        with pytest.raises(ValueError, match=selector_msg):
            store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3")

        # exceptions: the first argument must be a dictionary
        dict_msg = (
            "append_to_multiple must have a dictionary specified as the way to "
            "split the value"
        )
        with pytest.raises(ValueError, match=dict_msg):
            store.append_to_multiple("df1", df, "df1")

        # regular operation
        store.append_to_multiple({"df1": ["A", "B"], "df2": None}, df, selector="df1")
        result = store.select_as_multiple(
            ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
        )
        positive = (df.A > 0) & (df.B > 0)
        tm.assert_frame_equal(result, df[positive])
def test_append_to_multiple_dropna(setup_path):
    """With ``dropna=True`` the split tables keep identical row indexes."""
    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    ab_positions = left.columns.get_indexer(["A", "B"])
    left.iloc[1, ab_positions] = np.nan
    combined = concat([left, right], axis=1)

    with ensure_clean_store(setup_path) as store:
        # dropna=True should guarantee rows are synchronized
        store.append_to_multiple(
            {"df1": ["A", "B"], "df2": None}, combined, selector="df1", dropna=True
        )
        stored = store.select_as_multiple(["df1", "df2"])
        tm.assert_frame_equal(stored, combined.dropna(), check_index_type=True)

        first_index = store.select("df1").index
        second_index = store.select("df2").index
        tm.assert_index_equal(first_index, second_index)
def test_append_to_multiple_dropna_false(setup_path):
    """With ``dropna=False`` the split tables may end up with different rows."""
    left = tm.makeTimeDataFrame()
    right = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    ab_positions = left.columns.get_indexer(["A", "B"])
    left.iloc[1, ab_positions] = np.nan
    combined = concat([left, right], axis=1)

    with ensure_clean_store(setup_path) as store:
        with pd.option_context("io.hdf.dropna_table", True):
            # dropna=False shouldn't synchronize row indexes
            store.append_to_multiple(
                {"df1a": ["A", "B"], "df2a": None},
                combined,
                selector="df1a",
                dropna=False,
            )

            msg = "all tables must have exactly the same nrows!"
            with pytest.raises(ValueError, match=msg):
                store.select_as_multiple(["df1a", "df2a"])

            first_index = store.select("df1a").index
            second_index = store.select("df2a").index
            assert not first_index.equals(second_index)
def test_append_to_multiple_min_itemsize(setup_path):
    """``append_to_multiple`` accepts a ``min_itemsize`` mapping (GH 11238)."""
    numbers = np.arange(1, 21)
    frame = DataFrame(
        {
            "IX": np.arange(1, 21),
            "Num": np.arange(1, 21),
            "BigNum": numbers * 88,
            "Str": ["a"] * 20,
            "LongStr": ["abcde"] * 20,
        }
    )
    first_row = frame.iloc[[0]]
    layout = {
        "index": ["IX"],
        "nums": ["Num", "BigNum"],
        "strs": ["Str", "LongStr"],
    }

    with ensure_clean_store(setup_path) as store:
        store.append_to_multiple(
            layout,
            frame.iloc[[0]],
            "index",
            min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
        )
        stored = store.select_as_multiple(["index", "nums", "strs"])
        tm.assert_frame_equal(stored, first_row, check_index_type=True)

View File

@@ -0,0 +1,214 @@
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
Series,
_testing as tm,
concat,
read_hdf,
)
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
)
pytestmark = pytest.mark.single_cpu
def test_categorical(setup_path):
    """Round-trip, query, append and remove categorical data in table format."""
    raw_values = ["a", "b", "b", "a", "a", "c"]
    levels = ["a", "b", "c", "d"]

    with ensure_clean_store(setup_path) as store:
        # Basic
        _maybe_remove(store, "s")
        s = Series(Categorical(raw_values, categories=levels, ordered=False))
        store.append("s", s, format="table")
        result = store.select("s")
        tm.assert_series_equal(s, result)

        _maybe_remove(store, "s_ordered")
        s = Series(Categorical(raw_values, categories=levels, ordered=True))
        store.append("s_ordered", s, format="table")
        result = store.select("s_ordered")
        tm.assert_series_equal(s, result)

        _maybe_remove(store, "df")
        df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
        store.append("df", df, format="table")
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        # Dtypes
        _maybe_remove(store, "si")
        s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
        store.append("si", s)
        result = store.select("si")
        tm.assert_series_equal(result, s)

        _maybe_remove(store, "si2")
        s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
        store.append("si2", s)
        result = store.select("si2")
        tm.assert_series_equal(result, s)

        # Multiple
        _maybe_remove(store, "df2")
        df2 = df.copy()
        df2["s2"] = Series(list("abcdefg")).astype("category")
        store.append("df2", df2)
        result = store.select("df2")
        tm.assert_frame_equal(result, df2)

        # Make sure the metadata is OK
        info = store.info()
        assert "/df2 " in info
        # df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
        for block_name in ("values_block_0", "values_block_2"):
            assert f"/df2/meta/{block_name}/meta" in info

        # unordered
        _maybe_remove(store, "s2")
        s = Series(Categorical(raw_values, categories=levels, ordered=False))
        store.append("s2", s, format="table")
        result = store.select("s2")
        tm.assert_series_equal(result, s)

        # Query
        _maybe_remove(store, "df3")
        store.append("df3", df, data_columns=["s"])

        expected = df[df.s.isin(["b", "c"])]
        result = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        expected = df[df.s.isin(["b", "c"])]
        result = store.select("df3", where=['s = ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        expected = df[df.s.isin(["d"])]
        result = store.select("df3", where=['s in ["d"]'])
        tm.assert_frame_equal(result, expected)

        expected = df[df.s.isin(["f"])]
        result = store.select("df3", where=['s in ["f"]'])
        tm.assert_frame_equal(result, expected)

        # Appending with same categories is ok
        store.append("df3", df)

        df = concat([df, df])
        expected = df[df.s.isin(["b", "c"])]
        result = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        # Appending must have the same categories
        df3 = df.copy()
        df3["s"] = df3["s"].cat.remove_unused_categories()

        msg = "cannot append a categorical with different categories to the existing"
        with pytest.raises(ValueError, match=msg):
            store.append("df3", df3)

        # Remove, and make sure meta data is removed (its a recursive
        # removal so should be).
        meta_key = "df3/meta/s/meta"
        result = store.select(meta_key)
        assert result is not None
        store.remove("df3")

        with pytest.raises(
            KeyError, match="'No object named df3/meta/s/meta in the file'"
        ):
            store.select(meta_key)
def test_categorical_conversion(tmp_path, setup_path):
    """``read_hdf`` returns no rows when the where criteria isn't met (GH13322).

    Checked first with plain string columns, then with categorical columns.
    """
    df = DataFrame(
        {
            "obsids": ["ESP_012345_6789", "ESP_987654_3210"],
            "imgids": ["APF00006np", "APF0001imm"],
            "data": [4.3, 9.8],
        }
    )
    path = tmp_path / setup_path

    # Test without categories
    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    df.to_hdf(path, key="df", format="table", data_columns=True)
    result = read_hdf(path, key="df", where="obsids=B")
    tm.assert_frame_equal(result, expected)

    # Test with categories
    for column in ("obsids", "imgids"):
        df[column] = df[column].astype("category")

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    df.to_hdf(path, key="df", format="table", data_columns=True)
    result = read_hdf(path, key="df", where="obsids=B")
    tm.assert_frame_equal(result, expected)
def test_categorical_nan_only_columns(tmp_path, setup_path):
    """Categorical columns holding only NaN values can be read back (GH18413)."""
    frame = DataFrame(
        {
            "a": ["a", "b", "c", np.nan],
            "b": [np.nan, np.nan, np.nan, np.nan],
            "c": [1, 2, 3, 4],
            "d": Series([None] * 4, dtype=object),
        }
    )
    frame["a"] = frame.a.astype("category")
    frame["b"] = frame.b.astype("category")
    # NOTE(review): column "d" is replaced by the categorical version of "b",
    # not of "d" itself -- confirm whether that is intended.
    frame["d"] = frame.b.astype("category")

    hdf_path = tmp_path / setup_path
    frame.to_hdf(hdf_path, key="df", format="table", data_columns=True)
    stored = read_hdf(hdf_path, key="df")
    tm.assert_frame_equal(stored, frame)
@pytest.mark.parametrize(
    "where, df, expected",
    [
        ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
        ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
    ],
)
def test_convert_value(
    tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame
):
    """``read_hdf`` can filter a categorical column with a where condition (GH39420)."""
    df["col"] = df["col"].astype("category")
    categorical_values = sorted(df["col"].unique())

    # The expected frame carries the same categories as the stored column.
    expected["col"] = expected["col"].astype("category")
    expected["col"] = expected["col"].cat.set_categories(categorical_values)

    hdf_path = tmp_path / setup_path
    df.to_hdf(hdf_path, key="df", format="table", min_itemsize={"col": 1})
    result = read_hdf(hdf_path, where=where)
    tm.assert_frame_equal(result, expected)

View File

@@ -0,0 +1,75 @@
import pytest
import pandas as pd
import pandas._testing as tm
tables = pytest.importorskip("tables")
@pytest.fixture
def pytables_hdf5_file(tmp_path):
    """
    Use PyTables to create a simple HDF5 file.

    Yields a ``(path, object name, DataFrame)`` tuple; the DataFrame is built
    from the same sample rows that are written to the file.
    """
    objname = "pandas_test_timeseries"
    t0 = 1_561_105_000.0

    table_schema = {
        "c0": tables.Time64Col(pos=0),
        "c1": tables.StringCol(5, pos=1),
        "c2": tables.Int64Col(pos=2),
    }

    testsamples = [
        {"c0": t0, "c1": "aaaaa", "c2": 1},
        {"c0": t0 + 1, "c1": "bbbbb", "c2": 2},
        {"c0": t0 + 2, "c1": "ccccc", "c2": 10**5},
        {"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295},
    ]

    path = tmp_path / "written_with_pytables.h5"
    with tables.open_file(path, mode="w") as h5file:
        table = h5file.create_table("/", name=objname, description=table_schema)
        for sample in testsamples:
            # Fill one row field by field, then commit it to the table.
            for field in sample:
                table.row[field] = sample[field]
            table.row.append()

    yield path, objname, pd.DataFrame(testsamples)
class TestReadPyTablesHDF5:
    """
    A group of tests which covers reading HDF5 files written by plain PyTables
    (not written by pandas).

    Was introduced for regression-testing issue 11188.
    """

    def test_read_complete(self, pytables_hdf5_file):
        """Reading the whole table returns every sample row."""
        path, objname, samples = pytables_hdf5_file
        loaded = pd.read_hdf(path, key=objname)
        tm.assert_frame_equal(loaded, samples, check_index_type=True)

    def test_read_with_start(self, pytables_hdf5_file):
        """Reading with ``start`` only skips the leading rows."""
        path, objname, samples = pytables_hdf5_file
        # This is a regression test for pandas-dev/pandas/issues/11188
        loaded = pd.read_hdf(path, key=objname, start=1)
        wanted = samples[1:].reset_index(drop=True)
        tm.assert_frame_equal(loaded, wanted, check_index_type=True)

    def test_read_with_stop(self, pytables_hdf5_file):
        """Reading with ``stop`` only keeps the leading rows."""
        path, objname, samples = pytables_hdf5_file
        # This is a regression test for pandas-dev/pandas/issues/11188
        loaded = pd.read_hdf(path, key=objname, stop=1)
        wanted = samples[:1].reset_index(drop=True)
        tm.assert_frame_equal(loaded, wanted, check_index_type=True)

    def test_read_with_startstop(self, pytables_hdf5_file):
        """Reading with both ``start`` and ``stop`` returns the slice between them."""
        path, objname, samples = pytables_hdf5_file
        # This is a regression test for pandas-dev/pandas/issues/11188
        loaded = pd.read_hdf(path, key=objname, start=1, stop=2)
        wanted = samples[1:2].reset_index(drop=True)
        tm.assert_frame_equal(loaded, wanted, check_index_type=True)

View File

@@ -0,0 +1,195 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.io.pytables.common import ensure_clean_store
from pandas.io.pytables import read_hdf
def test_complex_fixed(tmp_path, setup_path):
    """complex64 and complex128 frames round-trip through ``to_hdf``/``read_hdf``."""
    path = tmp_path / setup_path

    for complex_dtype in (np.complex64, np.complex128):
        values = np.random.default_rng(2).random((4, 5)).astype(complex_dtype)
        df = DataFrame(values, index=list("abcd"), columns=list("ABCDE"))

        df.to_hdf(path, key="df")
        reread = read_hdf(path, key="df")
        tm.assert_frame_equal(df, reread)
def test_complex_table(tmp_path, setup_path):
    """complex64 and complex128 frames round-trip through the table format."""
    path = tmp_path / setup_path

    # The second write passes mode="w"; the first uses the default mode.
    cases = [(np.complex64, {}), (np.complex128, {"mode": "w"})]
    for complex_dtype, write_kwargs in cases:
        values = np.random.default_rng(2).random((4, 5)).astype(complex_dtype)
        df = DataFrame(values, index=list("abcd"), columns=list("ABCDE"))

        df.to_hdf(path, key="df", format="table", **write_kwargs)
        reread = read_hdf(path, key="df")
        tm.assert_frame_equal(df, reread)
def test_complex_mixed_fixed(tmp_path, setup_path):
    """A frame mixing int, str, float and complex columns round-trips via ``to_hdf``."""
    unit = [1.0 + 1.0j] * 4
    df = DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "d"],
            "C": np.array(unit, dtype=np.complex64),
            "D": np.array(unit, dtype=np.complex128),
            "E": [1.0, 2.0, 3.0, 4.0],
        },
        index=list("abcd"),
    )

    path = tmp_path / setup_path
    df.to_hdf(path, key="df")
    reread = read_hdf(path, key="df")
    tm.assert_frame_equal(df, reread)
def test_complex_mixed_table(tmp_path, setup_path):
    """A mixed frame with complex columns can be stored, queried and re-read as a table."""
    unit = [1.0 + 1.0j] * 4
    df = DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "d"],
            "C": np.array(unit, dtype=np.complex64),
            "D": np.array(unit, dtype=np.complex128),
            "E": [1.0, 2.0, 3.0, 4.0],
        },
        index=list("abcd"),
    )

    with ensure_clean_store(setup_path) as store:
        # Only the non-complex columns are declared as data columns.
        store.append("df", df, data_columns=["A", "B"])
        result = store.select("df", where="A>2")
        tm.assert_frame_equal(df.loc[df.A > 2], result)

    path = tmp_path / setup_path
    df.to_hdf(path, key="df", format="table")
    reread = read_hdf(path, key="df")
    tm.assert_frame_equal(df, reread)
def test_complex_across_dimensions_fixed(tmp_path, setup_path):
    """A complex Series and a complex DataFrame both round-trip in fixed format."""
    values = np.array([1.0 + 1.0j] * 4)
    s = Series(values, index=list("abcd"))
    df = DataFrame({"A": s, "B": s})

    # Pair each object with the comparison function suited to its type.
    cases = [(s, tm.assert_series_equal), (df, tm.assert_frame_equal)]
    for obj, assert_equal in cases:
        path = tmp_path / setup_path
        obj.to_hdf(path, key="obj", format="fixed")
        reread = read_hdf(path, key="obj")
        assert_equal(obj, reread)
def test_complex_across_dimensions(tmp_path, setup_path):
    """A frame built from a complex Series round-trips in table format."""
    values = np.array([1.0 + 1.0j] * 4)
    s = Series(values, index=list("abcd"))
    df = DataFrame({"A": s, "B": s})

    path = tmp_path / setup_path
    df.to_hdf(path, key="obj", format="table")
    reread = read_hdf(path, key="obj")
    tm.assert_frame_equal(df, reread)
def test_complex_indexing_error(setup_path):
    """Declaring a complex column as a data column raises ``TypeError``."""
    df = DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "d"],
            "C": np.array([1.0 + 1.0j] * 4, dtype=np.complex128),
        },
        index=list("abcd"),
    )

    msg = (
        "Columns containing complex values can be stored "
        "but cannot be indexed when using table format. "
        "Either use fixed format, set index=False, "
        "or do not include the columns containing complex "
        "values to data_columns when initializing the table."
    )

    with ensure_clean_store(setup_path) as store, pytest.raises(
        TypeError, match=msg
    ):
        store.append("df", df, data_columns=["C"])
def test_complex_series_error(tmp_path, setup_path):
    """A complex Series in table format raises unless ``index=False`` is passed."""
    s = Series(np.array([1.0 + 1.0j] * 4), index=list("abcd"))
    path = tmp_path / setup_path

    msg = (
        "Columns containing complex values can be stored "
        "but cannot be indexed when using table format. "
        "Either use fixed format, set index=False, "
        "or do not include the columns containing complex "
        "values to data_columns when initializing the table."
    )
    with pytest.raises(TypeError, match=msg):
        s.to_hdf(path, key="obj", format="t")

    # With index=False the same write succeeds and round-trips.
    s.to_hdf(path, key="obj", format="t", index=False)
    reread = read_hdf(path, key="obj")
    tm.assert_series_equal(s, reread)
def test_complex_append(setup_path):
    """Appending twice to a table with a complex column yields the doubled frame."""
    complex_col = np.random.default_rng(2).standard_normal(100).astype(np.complex128)
    real_col = np.random.default_rng(2).standard_normal(100)
    df = DataFrame({"a": complex_col, "b": real_col})

    with ensure_clean_store(setup_path) as store:
        # Only the real-valued column is declared as a data column.
        store.append("df", df, data_columns=["b"])
        store.append("df", df)
        result = store.select("df")
        expected = pd.concat([df, df], axis=0)
        tm.assert_frame_equal(expected, result)

Some files were not shown because too many files have changed in this diff Show More