Add a simple test suite for doing downstream tests
1 parent: c8ce8a5
Commit: 0d19474
Showing 2 changed files with 201 additions and 0 deletions.
@@ -0,0 +1,162 @@
import sys
import time

import dask.dataframe as dd
import moto.server
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
import s3fs
from pytest import fixture


@fixture(scope="session")
def partitioned_dataset() -> dict:
    # 500k rows: a low-cardinality partition key plus random payload columns.
    rows = 500000
    cds_df = pd.DataFrame(
        {
            "id": range(rows),
            "part_key": np.random.choice(["A", "B", "C", "D"], rows),
            "timestamp": np.random.randint(1051638817, 1551638817, rows),
            "int_value": np.random.randint(0, 60000, rows),
        }
    )
    return {"dataframe": cds_df, "partitioning_column": "part_key"}


def free_port():
    import socketserver

    # Binding to port 0 makes the OS pick an unused ephemeral port.
    with socketserver.TCPServer(("localhost", 0), None) as s:
        return s.server_address[1]


@fixture(scope="session")
def moto_server():
    import subprocess

    # Run moto in a subprocess so it serves a real HTTP endpoint that
    # s3fs/botocore clients can talk to.
    port = free_port()
    process = subprocess.Popen([
        sys.executable,
        moto.server.__file__,
        '--port', str(port),
        '--host', 'localhost',
        's3'
    ])

    s3fs_kwargs = dict(
        client_kwargs={"endpoint_url": f'http://localhost:{port}'},
    )

    # Poll until the server answers, giving up after 30 seconds.
    start = time.time()
    while True:
        try:
            fs = s3fs.S3FileSystem(skip_instance_cache=True, **s3fs_kwargs)
            fs.ls("/")
        except Exception:
            if time.time() - start > 30:
                raise TimeoutError("Could not get a working moto server in time")
            time.sleep(0.1)
            continue
        break

    yield s3fs_kwargs

    process.terminate()


@fixture(scope="session")
def moto_s3fs(moto_server):
    return s3fs.S3FileSystem(**moto_server)


@fixture(scope="session")
def s3_bucket(moto_server):
    test_bucket_name = 'test'
    from botocore.session import Session

    # NB: we use the sync botocore client for setup
    session = Session()
    client = session.create_client('s3', **moto_server['client_kwargs'])
    client.create_bucket(Bucket=test_bucket_name, ACL='public-read')
    return test_bucket_name


@fixture(scope="session")
def partitioned_parquet_path(partitioned_dataset, moto_s3fs, s3_bucket):
    cds_df = partitioned_dataset["dataframe"]
    table = pa.Table.from_pandas(cds_df, preserve_index=False)
    path = s3_bucket + "/partitioned/dataset"
    import pyarrow.parquet

    pyarrow.parquet.write_to_dataset(
        table,
        path,
        filesystem=moto_s3fs,
        # hive-style partitioning: one directory per part_key value
        partition_cols=[partitioned_dataset["partitioning_column"]],
    )

    # storage_options = dict(use_listings_cache=False)
    # storage_options.update(docker_aws_s3.s3fs_kwargs)
    #
    # import dask.dataframe
    #
    # ddf = dask.dataframe.read_parquet(
    #     f"s3://{path}", storage_options=storage_options, gather_statistics=False
    # )
    # all_rows = ddf.compute()
    # assert "name" in all_rows.columns
    return path

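
# A hypothetical companion test (name and assertion are illustrative, not
# part of the suite): the hive layout written above can be inspected
# directly through the moto_s3fs fixture, assuming pyarrow's
# part_key=<value> directory naming.
def test_partition_layout(moto_s3fs, partitioned_parquet_path):
    # write_to_dataset should produce one sub-directory per partition
    # value, e.g. test/partitioned/dataset/part_key=A/<uuid>.parquet
    dirs = moto_s3fs.ls(partitioned_parquet_path)
    assert sorted(d.rsplit("=", 1)[-1] for d in dirs) == ["A", "B", "C", "D"]
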
@pytest.fixture(scope='session', params=[
    pytest.param("pyarrow"),
    pytest.param("fastparquet"),
])
def parquet_engine(request):
    return request.param


@pytest.fixture(scope='session', params=[
    pytest.param(False, id='gather_statistics=F'),
    pytest.param(True, id='gather_statistics=T'),
])
def gather_statistics(request):
    return request.param

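
# A test that requests both fixtures above is collected once per
# engine x gather_statistics combination, i.e. four variants, with IDs
# along the lines of test_partitioned_read[pyarrow-gather_statistics=F]
# (the exact ordering inside the brackets can vary).
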
def test_partitioned_read(partitioned_dataset, partitioned_parquet_path, moto_server, parquet_engine, gather_statistics):
    """The directory based reading is quite finicky"""
    storage_options = moto_server.copy()
    ddf = dd.read_parquet(
        f"s3://{partitioned_parquet_path}",
        storage_options=storage_options,
        gather_statistics=gather_statistics,
        engine=parquet_engine
    )

    assert 'part_key' in ddf.columns
    actual = ddf.compute().sort_values('id').reset_index(drop=True)
    expected = partitioned_dataset["dataframe"].sort_values('id').reset_index(drop=True)

    # `assert actual == expected` would raise on DataFrames ("truth value
    # is ambiguous"); use pandas' test helper instead. The partition column
    # round-trips as a categorical, so dtypes are compared loosely.
    pd.testing.assert_frame_equal(
        actual[expected.columns],
        expected,
        check_dtype=False,
        check_categorical=False,
    )


def test_non_partitioned_read(partitioned_dataset, partitioned_parquet_path, moto_server, parquet_engine, gather_statistics):
    """The directory based reading is quite finicky"""
    storage_options = moto_server.copy()
    ddf = dd.read_parquet(
        f"s3://{partitioned_parquet_path}/part_key=A",
        storage_options=storage_options,
        gather_statistics=gather_statistics,
        engine=parquet_engine
    )

    if parquet_engine == 'pyarrow':
        assert 'part_key' in ddf.columns
    actual = ddf.compute().sort_values('id').reset_index(drop=True)
    expected = partitioned_dataset["dataframe"]
    expected = expected.loc[expected.part_key == "A"].reset_index(drop=True)

    # Engines disagree on whether the partition column survives a read of a
    # single partition directory, so compare only the shared columns.
    common = [c for c in expected.columns if c in actual.columns]
    pd.testing.assert_frame_equal(
        actual[common],
        expected[common],
        check_dtype=False,
        check_categorical=False,
    )
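
# Optional convenience entry point, assuming this module is saved as a
# pytest test file: executing it directly runs pytest on the file itself.
if __name__ == "__main__":
    raise SystemExit(pytest.main(["-v", __file__]))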