
Commit 82a27c9

Bumping version to 0.0.5
1 parent a4c9815 commit 82a27c9

10 files changed: +89 -23 lines changed

awswrangler/__version__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 __title__ = "awswrangler"
 __description__ = "Utility belt to handle data on AWS."
-__version__ = "0.0.4"
+__version__ = "0.0.5"
 __license__ = "Apache License 2.0"

awswrangler/pandas.py

Lines changed: 37 additions & 12 deletions

@@ -50,6 +50,7 @@ def read_csv(
 max_result_size=None,
 header="infer",
 names=None,
+usecols=None,
 dtype=None,
 sep=",",
 lineterminator="\n",
@@ -71,6 +72,7 @@ def read_csv(
 :param max_result_size: Max number of bytes on each request to S3
 :param header: Same as pandas.read_csv()
 :param names: Same as pandas.read_csv()
+:param usecols: Same as pandas.read_csv()
 :param dtype: Same as pandas.read_csv()
 :param sep: Same as pandas.read_csv()
 :param lineterminator: Same as pandas.read_csv()
@@ -96,6 +98,7 @@ def read_csv(
 max_result_size=max_result_size,
 header=header,
 names=names,
+usecols=usecols,
 dtype=dtype,
 sep=sep,
 lineterminator=lineterminator,
@@ -113,6 +116,7 @@ def read_csv(
 key_path=key_path,
 header=header,
 names=names,
+usecols=usecols,
 dtype=dtype,
 sep=sep,
 lineterminator=lineterminator,
@@ -133,6 +137,7 @@ def _read_csv_iterator(
 max_result_size=200_000_000, # 200 MB
 header="infer",
 names=None,
+usecols=None,
 dtype=None,
 sep=",",
 lineterminator="\n",
@@ -155,6 +160,7 @@ def _read_csv_iterator(
 :param max_result_size: Max number of bytes on each request to S3
 :param header: Same as pandas.read_csv()
 :param names: Same as pandas.read_csv()
+:param usecols: Same as pandas.read_csv()
 :param dtype: Same as pandas.read_csv()
 :param sep: Same as pandas.read_csv()
 :param lineterminator: Same as pandas.read_csv()
@@ -182,6 +188,7 @@ def _read_csv_iterator(
 key_path=key_path,
 header=header,
 names=names,
+usecols=usecols,
 dtype=dtype,
 sep=sep,
 lineterminator=lineterminator,
@@ -235,6 +242,7 @@ def _read_csv_iterator(
 StringIO(body[:last_char].decode("utf-8")),
 header=header,
 names=names,
+usecols=usecols,
 sep=sep,
 quotechar=quotechar,
 quoting=quoting,
@@ -353,6 +361,7 @@ def _read_csv_once(
 key_path,
 header="infer",
 names=None,
+usecols=None,
 dtype=None,
 sep=",",
 lineterminator="\n",
@@ -374,6 +383,7 @@ def _read_csv_once(
 :param key_path: S3 key path (W/o bucket)
 :param header: Same as pandas.read_csv()
 :param names: Same as pandas.read_csv()
+:param usecols: Same as pandas.read_csv()
 :param dtype: Same as pandas.read_csv()
 :param sep: Same as pandas.read_csv()
 :param lineterminator: Same as pandas.read_csv()
@@ -395,6 +405,7 @@ def _read_csv_once(
 buff,
 header=header,
 names=names,
+usecols=usecols,
 sep=sep,
 quotechar=quotechar,
 quoting=quoting,
@@ -714,7 +725,8 @@ def _data_to_s3_dataset_writer(dataframe,
 session_primitives,
 file_format,
 cast_columns=None,
-extra_args=None):
+extra_args=None,
+isolated_dataframe=False):
 objects_paths = []
 if not partition_cols:
 object_path = Pandas._data_to_s3_object_writer(
@@ -725,7 +737,8 @@ def _data_to_s3_dataset_writer(dataframe,
 session_primitives=session_primitives,
 file_format=file_format,
 cast_columns=cast_columns,
-extra_args=extra_args)
+extra_args=extra_args,
+isolated_dataframe=isolated_dataframe)
 objects_paths.append(object_path)
 else:
 for keys, subgroup in dataframe.groupby(partition_cols):
@@ -744,7 +757,8 @@ def _data_to_s3_dataset_writer(dataframe,
 session_primitives=session_primitives,
 file_format=file_format,
 cast_columns=cast_columns,
-extra_args=extra_args)
+extra_args=extra_args,
+isolated_dataframe=True)
 objects_paths.append(object_path)
 return objects_paths
 
@@ -769,7 +783,8 @@ def _data_to_s3_dataset_writer_remote(send_pipe,
 session_primitives=session_primitives,
 file_format=file_format,
 cast_columns=cast_columns,
-extra_args=extra_args))
+extra_args=extra_args,
+isolated_dataframe=True))
 send_pipe.close()
 
 @staticmethod
@@ -780,7 +795,8 @@ def _data_to_s3_object_writer(dataframe,
 session_primitives,
 file_format,
 cast_columns=None,
-extra_args=None):
+extra_args=None,
+isolated_dataframe=False):
 fs = s3.get_fs(session_primitives=session_primitives)
 fs = pyarrow.filesystem._ensure_filesystem(fs)
 s3.mkdir_if_not_exists(fs, path)
@@ -803,12 +819,14 @@ def _data_to_s3_object_writer(dataframe,
 raise UnsupportedFileFormat(file_format)
 object_path = "/".join([path, outfile])
 if file_format == "parquet":
-Pandas.write_parquet_dataframe(dataframe=dataframe,
-path=object_path,
-preserve_index=preserve_index,
-compression=compression,
-fs=fs,
-cast_columns=cast_columns)
+Pandas.write_parquet_dataframe(
+dataframe=dataframe,
+path=object_path,
+preserve_index=preserve_index,
+compression=compression,
+fs=fs,
+cast_columns=cast_columns,
+isolated_dataframe=isolated_dataframe)
 elif file_format == "csv":
 Pandas.write_csv_dataframe(dataframe=dataframe,
 path=object_path,
@@ -848,15 +866,17 @@ def write_csv_dataframe(dataframe,
 
 @staticmethod
 def write_parquet_dataframe(dataframe, path, preserve_index, compression,
-fs, cast_columns):
+fs, cast_columns, isolated_dataframe):
 if not cast_columns:
 cast_columns = {}
 
 # Casting on Pandas
+casted_in_pandas = []
 dtypes = copy.deepcopy(dataframe.dtypes.to_dict())
 for name, dtype in dtypes.items():
 if str(dtype) == "Int64":
 dataframe[name] = dataframe[name].astype("float64")
+casted_in_pandas.append(name)
 cast_columns[name] = "bigint"
 logger.debug(f"Casting column {name} Int64 to float64")
 
@@ -885,6 +905,11 @@ def write_parquet_dataframe(dataframe, path, preserve_index, compression,
 coerce_timestamps="ms",
 flavor="spark")
 
+# Casting back on Pandas if necessary
+if isolated_dataframe is False:
+for col in casted_in_pandas:
+dataframe[col] = dataframe[col].astype("Int64")
+
 def to_redshift(
 self,
 dataframe,
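
Besides threading the new usecols argument through every read_csv code path, the substantive change in pandas.py is in write_parquet_dataframe: nullable Int64 columns are still cast to float64 (and registered as bigint in cast_columns) before the Arrow write, but the new casted_in_pandas list records which columns were touched, and the new isolated_dataframe flag, set to True only for partition subgroups and for frames handed to a child process, decides whether those columns are cast back to Int64 afterwards, presumably so a caller's own DataFrame is no longer left with mutated dtypes. Below is a minimal sketch of that cast-and-restore pattern in plain pandas (no S3 or Arrow write; the column names are illustrative):

import pandas as pd

# Illustrative frame with a nullable integer column.
df = pd.DataFrame({"id": pd.array([1, 2, None], dtype="Int64"),
                   "name": ["a", "b", "c"]})

# Cast Int64 columns to float64 for the write, remembering which ones were touched.
casted_in_pandas = []
for name, dtype in df.dtypes.to_dict().items():
    if str(dtype) == "Int64":
        df[name] = df[name].astype("float64")  # None becomes NaN
        casted_in_pandas.append(name)

# ... the Parquet file would be written from `df` at this point ...

# isolated_dataframe=False: the frame belongs to the caller, so restore its dtypes.
for name in casted_in_pandas:
    df[name] = df[name].astype("Int64")

assert str(df["id"].dtype) == "Int64"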

building/build-glue-wheel.sh

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+cd ..
+rm -rf *.egg-info build dist/*.whl
+python3.6 setup.py bdist_wheel
+rm -rf *.egg-info build
+cd building

building/publish.sh

Lines changed: 1 addition & 1 deletion

@@ -3,6 +3,6 @@ set -e
 
 cd ..
 rm -fr build dist .egg awswrangler.egg-info
-python setup.py sdist bdist_wheel
+python setup.py sdist
 twine upload dist/*
 rm -fr build dist .egg awswrangler.egg-info

docs/source/api/awswrangler.data_types.rst

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+awswrangler.data\_types module
+==============================
+
+.. automodule:: awswrangler.data_types
+:members:
+:undoc-members:
+:show-inheritance:

docs/source/api/awswrangler.rst

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@ Submodules
 
 awswrangler.athena
 awswrangler.cloudwatchlogs
+awswrangler.data_types
 awswrangler.exceptions
 awswrangler.glue
 awswrangler.pandas

requirements.txt

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-botocore>=1.12.224
-boto3>=1.9.224
+botocore>=1.12.238
+boto3>=1.9.238
 pandas>=0.25.1
 s3fs>=0.3.4
 pyarrow>=0.14.1

setup.cfg

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 [bdist_wheel]
-python-tag = py36,py37
+python-tag = glue
 
 [metadata]
 license_file = LICENSE

setup.py

Lines changed: 2 additions & 2 deletions

@@ -24,8 +24,8 @@
 install_requires=[
 "pyarrow>=0.14.1",
 "pandas>=0.25.1",
-"botocore>=1.12.224",
-"boto3>=1.9.224",
+"botocore>=1.12.238",
+"boto3>=1.9.238",
 "s3fs>=0.3.4",
 "tenacity>=5.1.1",
 "pg8000>=1.13.2",

testing/test_awswrangler/test_pandas.py

Lines changed: 29 additions & 4 deletions

@@ -126,6 +126,33 @@ def test_read_csv_iterator(session, bucket, sample, row_num):
 assert total_count == row_num
 
 
+@pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30),
+("data_samples/small.csv", 100)])
+def test_read_csv_usecols(session, bucket, sample, row_num):
+boto3.client("s3").upload_file(sample, bucket, sample)
+path = f"s3://{bucket}/{sample}"
+dataframe = session.pandas.read_csv(path=path, usecols=["id", "name"])
+session.s3.delete_objects(path=path)
+assert len(dataframe.index) == row_num
+assert len(dataframe.columns) == 2
+
+
+@pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30),
+("data_samples/small.csv", 100)])
+def test_read_csv_iterator_usecols(session, bucket, sample, row_num):
+boto3.client("s3").upload_file(sample, bucket, sample)
+path = f"s3://{bucket}/{sample}"
+dataframe_iter = session.pandas.read_csv(path=path,
+usecols=[0, 1],
+max_result_size=200)
+total_count = 0
+for dataframe in dataframe_iter:
+total_count += len(dataframe.index)
+assert len(dataframe.columns) == 2
+session.s3.delete_objects(path=path)
+assert total_count == row_num
+
+
 @pytest.mark.parametrize(
 "mode, file_format, preserve_index, partition_cols, procs_cpu_bound, factor",
 [
@@ -745,10 +772,8 @@ def test_to_parquet_with_cast_null(
 
 
 def test_read_sql_athena_with_time_zone(session, bucket, database):
-dataframe = session.pandas.read_sql_athena(
-sql=
-"select current_timestamp as value, typeof(current_timestamp) as type",
-database=database)
+query = "select current_timestamp as value, typeof(current_timestamp) as type"
+dataframe = session.pandas.read_sql_athena(sql=query, database=database)
 assert len(dataframe.index) == 1
 assert len(dataframe.columns) == 2
 assert dataframe["type"][0] == "timestamp with time zone"
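
The two new tests above also double as usage documentation for the usecols pass-through. For reference, a hedged sketch of the same calls outside pytest (the bucket name and file keys are hypothetical, and the Session is assumed to pick up AWS credentials from the environment):

import awswrangler

session = awswrangler.Session()

# Single-shot read, keeping only two named columns.
df = session.pandas.read_csv(path="s3://my-bucket/data_samples/small.csv",
                             usecols=["id", "name"])

# Chunked read: usecols (by position here) is applied to every chunk the
# iterator yields, and max_result_size bounds the bytes fetched per request.
total_rows = 0
for chunk in session.pandas.read_csv(path="s3://my-bucket/data_samples/small.csv",
                                     usecols=[0, 1],
                                     max_result_size=200_000_000):
    total_rows += len(chunk.index)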
