From 5ca62fc353862cdc1164a0d8ec33382844c34226 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 26 Sep 2018 13:29:07 -0700 Subject: [PATCH 01/17] Add assign method to dataframe --- pygdf/dataframe.py | 6 ++++++ pygdf/tests/test_dataframe.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index bc65a6ee9b1..95cb498741d 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -213,6 +213,12 @@ def __len__(self): """ return self._size + def assign(self, **kwargs): + new = self.copy() + for k, v in kwargs.items(): + new[k] = v + return new + def head(self, n=5): return self[:n] diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index 4c7e15390d3..8724ab60144 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -518,6 +518,15 @@ def test_dataframe_setitem_index_len1(): np.testing.assert_equal(gdf.b.to_array(), [0]) +def test_assign(): + gdf = DataFrame({'x': [1, 2, 3]}) + gdf2 = gdf.assign(y=gdf.x + 1) + assert gdf.columns == ['x'] + assert gdf2.columns == ['x', 'y'] + + np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4]) + + @pytest.mark.parametrize('nrows', [1, 8, 100, 1000]) def test_dataframe_hash_columns(nrows): gdf = DataFrame() From a771fa9c7ae6fe4138bf74f43edcb12b82a9a2d3 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 26 Sep 2018 13:29:23 -0700 Subject: [PATCH 02/17] add pow method to series, but only for the value 2 (sorry for the hack, this was useful for std/var in dask-gdf) --- pygdf/series.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pygdf/series.py b/pygdf/series.py index d5e866186d1..3d4f651f839 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -306,6 +306,12 @@ def __mul__(self, other): def __rmul__(self, other): return self._rbinaryop(other, 'mul') + def __pow__(self, other): + if other == 2: + return self * self + else: + return NotImplemented + def __floordiv__(self, other): return self._binaryop(other, 'floordiv') From 8f8737b8e0d21b28138d45dab30dff76b8507545 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 26 Sep 2018 13:29:54 -0700 Subject: [PATCH 03/17] Add axis and skipna keywords to series reductions These don't do anything currently, and are silently ignored --- pygdf/series.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pygdf/series.py b/pygdf/series.py index 3d4f651f839..b47e685ff72 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -694,35 +694,35 @@ def find_last_value(self, value): # # Stats # - def count(self): + def count(self, axis=None, skipna=True): """The number of non-null values""" return self.valid_count - def min(self): + def min(self, axis=None, skipna=True): """Compute the min of the series """ return self._column.min() - def max(self): + def max(self, axis=None, skipna=True): """Compute the max of the series """ return self._column.max() - def sum(self): + def sum(self, axis=None, skipna=True): """Compute the sum of the series""" return self._column.sum() - def mean(self): + def mean(self, axis=None, skipna=True): """Compute the mean of the series """ return self._column.mean() - def std(self, ddof=1): + def std(self, ddof=1, axis=None, skipna=True): """Compute the standard deviation of the series """ return np.sqrt(self.var(ddof=ddof)) - def var(self, ddof=1): + def var(self, ddof=1, axis=None, skipna=True): """Compute the variance of the series """ mu, var = self.mean_var(ddof=ddof) From 50ba8a62b564934efc0b59434e420dfea5f3f767 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 26 Sep 2018 16:24:38 -0700 Subject: [PATCH 04/17] assert axis and skipna in Series reductions --- pygdf/series.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pygdf/series.py b/pygdf/series.py index b47e685ff72..8843f77bea7 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -696,35 +696,42 @@ def find_last_value(self, value): # def count(self, axis=None, skipna=True): """The number of non-null values""" + assert axis in (None, 0) and skipna is True return self.valid_count def min(self, axis=None, skipna=True): """Compute the min of the series """ + assert axis in (None, 0) and skipna is True return self._column.min() def max(self, axis=None, skipna=True): """Compute the max of the series """ + assert axis in (None, 0) and skipna is True return self._column.max() def sum(self, axis=None, skipna=True): """Compute the sum of the series""" + assert axis in (None, 0) and skipna is True return self._column.sum() def mean(self, axis=None, skipna=True): """Compute the mean of the series """ + assert axis in (None, 0) and skipna is True return self._column.mean() def std(self, ddof=1, axis=None, skipna=True): """Compute the standard deviation of the series """ + assert axis in (None, 0) and skipna is True return np.sqrt(self.var(ddof=ddof)) def var(self, ddof=1, axis=None, skipna=True): """Compute the variance of the series """ + assert axis in (None, 0) and skipna is True mu, var = self.mean_var(ddof=ddof) return var From 46cb1d4baf111055d35b1b3e6c82de63ad120eae Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 26 Sep 2018 16:28:05 -0700 Subject: [PATCH 05/17] fix assign test --- pygdf/dataframe.py | 2 ++ pygdf/tests/test_dataframe.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index 95cb498741d..194bd905c38 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -86,6 +86,8 @@ def __init__(self, name_series=None, index=None): self._cols = OrderedDict() # has initializer? if name_series is not None: + if isinstance(name_series, dict): + name_series = name_series.items() for k, series in name_series: self.add_column(k, series, forceindex=index is not None) diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index 8724ab60144..fee63a700f4 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -521,8 +521,8 @@ def test_dataframe_setitem_index_len1(): def test_assign(): gdf = DataFrame({'x': [1, 2, 3]}) gdf2 = gdf.assign(y=gdf.x + 1) - assert gdf.columns == ['x'] - assert gdf2.columns == ['x', 'y'] + assert list(gdf.columns) == ['x'] + assert list(gdf2.columns) == ['x', 'y'] np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4]) From 84b14c435ea4a2c2ba2a228e614d9e5786fef38a Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 26 Sep 2018 16:44:45 -0700 Subject: [PATCH 06/17] Add Series.name and Series.from_pandas --- pygdf/dataframe.py | 1 + pygdf/series.py | 22 +++++++++++++++++++++- pygdf/tests/test_dataframe.py | 20 ++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index 194bd905c38..213b2d634ad 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -423,6 +423,7 @@ def add_column(self, name, data, forceindex=False): raise NameError('duplicated column name {!r}'.format(name)) series = self._prepare_series_for_add(data, forceindex=forceindex) + series.name = name self._cols[name] = series def drop_column(self, name): diff --git a/pygdf/series.py b/pygdf/series.py index 8843f77bea7..a2ccbab863f 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -6,6 +6,7 @@ from numbers import Number import numpy as np +import pandas as pd from . import cudautils, formatting from .buffer import Buffer @@ -59,9 +60,21 @@ def from_masked_array(cls, data, mask, null_count=None): return cls(data=col) def __init__(self, data, index=None): + name = None + if isinstance(data, pd.Series): + from .dataframe import DataFrame + inp = data + name = data.name + data = data.to_frame() + data.columns = ['x'] + data = DataFrame.from_pandas(data) + data = data['x'] + data.name = name if isinstance(data, Series): index = data._index + name = data.name data = data._column + if not isinstance(data, columnops.TypedColumnBase): data = columnops.as_column(data) @@ -71,6 +84,11 @@ def __init__(self, data, index=None): assert isinstance(data, columnops.TypedColumnBase) self._column = data self._index = RangeIndex(len(data)) if index is None else index + self.name = name + + @classmethod + def from_pandas(cls, s): + return cls(s) def serialize(self, serialize): header = {} @@ -448,7 +466,9 @@ def to_gpu_array(self, fillna=None): def to_pandas(self, index=True): if index is True: index = self.index.to_pandas() - return self._column.to_pandas(index=index) + s = self._column.to_pandas(index=index) + s.name = self.name + return s @property def data(self): diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index fee63a700f4..ecc18e822f4 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -670,3 +670,23 @@ def do_slice(x): got = do_slice(gdf).to_pandas() pd.testing.assert_frame_equal(expect, got) + + +def test_from_pandas(): + df = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + gdf = gd.DataFrame.from_pandas(df) + assert isinstance(gdf, gd.DataFrame) + + pd.testing.assert_frame_equal(df, gdf.to_pandas()) + + s = df.x + gs = gd.Series.from_pandas(s) + assert isinstance(gs, gd.Series) + + pd.testing.assert_series_equal(s, gs.to_pandas()) + + +def test_series_name(): + df = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + gdf = gd.DataFrame.from_pandas(df) + assert gdf['x'].name == 'x' From d1bc16e5d09393477a203f9e1771fc58ee8eef9b Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Wed, 26 Sep 2018 16:44:45 -0700 Subject: [PATCH 07/17] Add Series.name and Series.from_pandas --- pygdf/dataframe.py | 1 + pygdf/series.py | 21 ++++++++++++++++++++- pygdf/tests/test_dataframe.py | 20 ++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index 194bd905c38..213b2d634ad 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -423,6 +423,7 @@ def add_column(self, name, data, forceindex=False): raise NameError('duplicated column name {!r}'.format(name)) series = self._prepare_series_for_add(data, forceindex=forceindex) + series.name = name self._cols[name] = series def drop_column(self, name): diff --git a/pygdf/series.py b/pygdf/series.py index 8843f77bea7..81bee059e63 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -6,6 +6,7 @@ from numbers import Number import numpy as np +import pandas as pd from . import cudautils, formatting from .buffer import Buffer @@ -59,9 +60,20 @@ def from_masked_array(cls, data, mask, null_count=None): return cls(data=col) def __init__(self, data, index=None): + name = None + if isinstance(data, pd.Series): + from .dataframe import DataFrame + name = data.name + data = data.to_frame() + data.columns = ['x'] + data = DataFrame.from_pandas(data) + data = data['x'] + data.name = name if isinstance(data, Series): index = data._index + name = data.name data = data._column + if not isinstance(data, columnops.TypedColumnBase): data = columnops.as_column(data) @@ -71,6 +83,11 @@ def __init__(self, data, index=None): assert isinstance(data, columnops.TypedColumnBase) self._column = data self._index = RangeIndex(len(data)) if index is None else index + self.name = name + + @classmethod + def from_pandas(cls, s): + return cls(s) def serialize(self, serialize): header = {} @@ -448,7 +465,9 @@ def to_gpu_array(self, fillna=None): def to_pandas(self, index=True): if index is True: index = self.index.to_pandas() - return self._column.to_pandas(index=index) + s = self._column.to_pandas(index=index) + s.name = self.name + return s @property def data(self): diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index fee63a700f4..ecc18e822f4 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -670,3 +670,23 @@ def do_slice(x): got = do_slice(gdf).to_pandas() pd.testing.assert_frame_equal(expect, got) + + +def test_from_pandas(): + df = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + gdf = gd.DataFrame.from_pandas(df) + assert isinstance(gdf, gd.DataFrame) + + pd.testing.assert_frame_equal(df, gdf.to_pandas()) + + s = df.x + gs = gd.Series.from_pandas(s) + assert isinstance(gs, gd.Series) + + pd.testing.assert_series_equal(s, gs.to_pandas()) + + +def test_series_name(): + df = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + gdf = gd.DataFrame.from_pandas(df) + assert gdf['x'].name == 'x' From b4e26a32e5ffdeb218c8ff1f8f31b7f7d57427e5 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 05:01:08 -0700 Subject: [PATCH 08/17] Use normal Series/Index constructors --- pygdf/series.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pygdf/series.py b/pygdf/series.py index 81bee059e63..22a51b6a024 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -62,13 +62,8 @@ def from_masked_array(cls, data, mask, null_count=None): def __init__(self, data, index=None): name = None if isinstance(data, pd.Series): - from .dataframe import DataFrame name = data.name - data = data.to_frame() - data.columns = ['x'] - data = DataFrame.from_pandas(data) - data = data['x'] - data.name = name + index = GenericIndex(data.index) if isinstance(data, Series): index = data._index name = data.name From 4ef79cc4d43fd618ce2d81b71e7a332c253224c5 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 05:01:38 -0700 Subject: [PATCH 09/17] import Index at top level Fixes #259 --- pygdf/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pygdf/__init__.py b/pygdf/__init__.py index c6082cb8fa0..242ca426c85 100644 --- a/pygdf/__init__.py +++ b/pygdf/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2018, NVIDIA CORPORATION. from .dataframe import DataFrame +from .index import Index from .series import Series from .multi import concat From 761f5d33da1d83954f4ed3e2208345dcdd06f1b3 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 05:03:55 -0700 Subject: [PATCH 10/17] remove __all__ from __init__.py We were listing all locals in the file, so it doesn't seem to accomplish much. It does however require people editing this file to make two changes rather than one. --- pygdf/__init__.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pygdf/__init__.py b/pygdf/__init__.py index 242ca426c85..449c8a5fd17 100644 --- a/pygdf/__init__.py +++ b/pygdf/__init__.py @@ -11,10 +11,3 @@ from ._version import get_versions __version__ = get_versions()['version'] del get_versions - -__all__ = [ - DataFrame, - Series, - concat, - set_options, -] From 7c672fb18f0e114033aa35a1ed8ae3b982e5ebdc Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 05:09:19 -0700 Subject: [PATCH 11/17] add failing test for index coercion --- pygdf/tests/test_dataframe.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index ecc18e822f4..0323d4d0d12 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -690,3 +690,12 @@ def test_series_name(): df = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) gdf = gd.DataFrame.from_pandas(df) assert gdf['x'].name == 'x' + + +@pytest.mark.xfail(reason="constructor does not coerce index inputs") +def test_index_in_dataframe_constructor(): + a = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + b = gd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + + pd.testing.assert_frame_equal(a, b.to_pandas()) + assert pd.testing.assert_frame_equal(a.loc[4:], b.loc[4:].to_pandas()) From c2675e566f9399917294209b9b6b8faf85b7a25f Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 05:33:27 -0700 Subject: [PATCH 12/17] add basic reductions to Index --- pygdf/index.py | 9 +++++++++ pygdf/tests/test_index.py | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pygdf/index.py b/pygdf/index.py index 28d68275235..df633d62867 100644 --- a/pygdf/index.py +++ b/pygdf/index.py @@ -53,6 +53,15 @@ def to_pandas(self): def gpu_values(self): return self.as_column().to_gpu_array() + def min(self): + return self.as_column().min() + + def max(self): + return self.as_column().max() + + def sum(self): + return self.as_column().sum() + def find_segments(self): """Return the beginning index for segments diff --git a/pygdf/tests/test_index.py b/pygdf/tests/test_index.py index 822e529d722..0baf2b5c61c 100644 --- a/pygdf/tests/test_index.py +++ b/pygdf/tests/test_index.py @@ -71,3 +71,15 @@ def test_index_comparision(): assert gi == rg assert rg[:-1] != gi assert rg[:-1] == gi[:-1] + + +@pytest.mark.parametrize('func', [ + lambda x: x.min(), + lambda x: x.max(), + lambda x: x.sum(), +]) +def test_index_find_label_range(func): + x = np.asarray([4, 5, 6, 10]) + idx = GenericIndex(np.asarray([4, 5, 6, 10])) + + assert func(x) == func(idx) From 6dca2ef14951e381b9ffa4058891c8a15c4468a7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 05:41:29 -0700 Subject: [PATCH 13/17] add name to index --- pygdf/index.py | 9 ++++++--- pygdf/tests/test_index.py | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pygdf/index.py b/pygdf/index.py index df633d62867..47a642e42b5 100644 --- a/pygdf/index.py +++ b/pygdf/index.py @@ -109,7 +109,7 @@ def join(self, other, method, how='left', return_indexers=False): class RangeIndex(Index): """Basic start..stop """ - def __init__(self, start, stop=None): + def __init__(self, start, stop=None, name=None): """RangeIndex(size), RangeIndex(start, stop) Parameters @@ -120,6 +120,7 @@ def __init__(self, start, stop=None): start, stop = 0, start self._start = int(start) self._stop = int(stop) + self.name = name def __repr__(self): return "{}(start={}, stop={})".format(self.__class__.__name__, @@ -192,7 +193,7 @@ def index_from_range(start, stop=None, step=None): class GenericIndex(Index): - def __new__(self, values): + def __new__(self, values, name=None): from .series import Series # normalize the input @@ -209,6 +210,7 @@ def __new__(self, values): # Make GenericIndex object res = Index.__new__(GenericIndex) res._values = values + res.name = name return res def serialize(self, serialize): @@ -280,7 +282,7 @@ def find_label_range(self, first, last): class DatetimeIndex(GenericIndex): # TODO this constructor should take a timezone or something to be # consistent with pandas - def __new__(self, values): + def __new__(self, values, name=None): # we should be more strict on what we accept here but # we'd have to go and figure out all the semantics around # pandas dtindex creation first which. For now @@ -294,6 +296,7 @@ def __new__(self, values): # override __new__ properly res = Index.__new__(DatetimeIndex) res._values = values + res.name = name return res @property diff --git a/pygdf/tests/test_index.py b/pygdf/tests/test_index.py index 0baf2b5c61c..7dfbee46fa4 100644 --- a/pygdf/tests/test_index.py +++ b/pygdf/tests/test_index.py @@ -83,3 +83,10 @@ def test_index_find_label_range(func): idx = GenericIndex(np.asarray([4, 5, 6, 10])) assert func(x) == func(idx) + + +def test_name(): + x = np.asarray([4, 5, 6, 10]) + idx = GenericIndex(np.asarray([4, 5, 6, 10]), name='foo') + assert idx.name == 'foo' + From 68754662f0b3e893ba88b1bb4ab31a8195616269 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 06:15:55 -0700 Subject: [PATCH 14/17] improve support for name --- pygdf/dataframe.py | 4 +++- pygdf/series.py | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index 213b2d634ad..f3a7d4691bb 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -178,7 +178,9 @@ def __getitem__(self, arg): 3 3 3 """ if isinstance(arg, str) or isinstance(arg, int): - return self._cols[arg] + s = self._cols[arg] + assert s.name == arg + return s elif isinstance(arg, slice): df = DataFrame() for k, col in self._cols.items(): diff --git a/pygdf/series.py b/pygdf/series.py index 22a51b6a024..5e06c7ee190 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -59,8 +59,7 @@ def from_masked_array(cls, data, mask, null_count=None): col = columnops.as_column(data).set_mask(mask, null_count=null_count) return cls(data=col) - def __init__(self, data, index=None): - name = None + def __init__(self, data, index=None, name=None): if isinstance(data, pd.Series): name = data.name index = GenericIndex(data.index) @@ -116,6 +115,7 @@ def _copy_construct_defaults(self): return dict( data=self._column, index=self._index, + name=self.name, ) def _copy_construct(self, **kwargs): @@ -388,8 +388,13 @@ def _concat(cls, objs, index=True): if index is True: index = Index._concat([o.index for o in objs]) + names = {obj.name for obj in objs} + if len(names) == 1: + [name] = names + else: + name = None col = Column._concat([o._column for o in objs]) - return cls(data=col, index=index) + return cls(data=col, index=index, name=name) def append(self, arbitrary): """Append values from another ``Series`` or array-like object. From 1d4e83f611d65c2f818c3b12d4a8f73642428193 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 06:17:43 -0700 Subject: [PATCH 15/17] flake8 --- pygdf/tests/test_dataframe.py | 6 ------ pygdf/tests/test_index.py | 4 +--- setup.cfg | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index 0323d4d0d12..acb440ce429 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -686,12 +686,6 @@ def test_from_pandas(): pd.testing.assert_series_equal(s, gs.to_pandas()) -def test_series_name(): - df = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) - gdf = gd.DataFrame.from_pandas(df) - assert gdf['x'].name == 'x' - - @pytest.mark.xfail(reason="constructor does not coerce index inputs") def test_index_in_dataframe_constructor(): a = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) diff --git a/pygdf/tests/test_index.py b/pygdf/tests/test_index.py index 7dfbee46fa4..68860be6f3e 100644 --- a/pygdf/tests/test_index.py +++ b/pygdf/tests/test_index.py @@ -78,7 +78,7 @@ def test_index_comparision(): lambda x: x.max(), lambda x: x.sum(), ]) -def test_index_find_label_range(func): +def test_reductions(func): x = np.asarray([4, 5, 6, 10]) idx = GenericIndex(np.asarray([4, 5, 6, 10])) @@ -86,7 +86,5 @@ def test_index_find_label_range(func): def test_name(): - x = np.asarray([4, 5, 6, 10]) idx = GenericIndex(np.asarray([4, 5, 6, 10]), name='foo') assert idx.name == 'foo' - diff --git a/setup.cfg b/setup.cfg index 60dcff452f9..82dd2d8bcd2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,4 +13,4 @@ tag_prefix = v parentdir_prefix = pygdf- [flake8] -exclude = img,notebooks,thirdparty +exclude = img,notebooks,thirdparty,__init__.py From a938ba83d9e0c6bc20f357d28aed47265d128e27 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 11:21:41 -0700 Subject: [PATCH 16/17] set Series name --- pygdf/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index f3a7d4691bb..1e59294efa7 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -179,7 +179,7 @@ def __getitem__(self, arg): """ if isinstance(arg, str) or isinstance(arg, int): s = self._cols[arg] - assert s.name == arg + s.name = arg return s elif isinstance(arg, slice): df = DataFrame() From 64abb9bff5a3c784c3859814139f3421ea383930 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 27 Sep 2018 11:23:01 -0700 Subject: [PATCH 17/17] Use Pandas Index for columns attribute --- pygdf/dataframe.py | 2 +- pygdf/tests/test_dataframe.py | 22 +++++++++++----------- pygdf/tests/test_onehot.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index 1e59294efa7..35b60bbc427 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -304,7 +304,7 @@ def loc(self): def columns(self): """Returns a tuple of columns """ - return tuple(self._cols) + return pd.Index(self._cols) @property def index(self): diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index acb440ce429..70441529358 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -123,7 +123,7 @@ def test_dataframe_basic(): df['vals'] = rnd_vals np.testing.assert_equal(df['vals'].to_array(), rnd_vals) assert len(df) == 10 - assert df.columns == ('keys', 'vals') + assert tuple(df.columns) == ('keys', 'vals') # Make another dataframe df2 = DataFrame() @@ -177,13 +177,13 @@ def test_dataframe_column_add_drop(): data = np.asarray(range(10)) df['a'] = data df['b'] = data - assert df.columns == ('a', 'b') + assert tuple(df.columns) == ('a', 'b') del df['a'] - assert df.columns == ('b',) + assert tuple(df.columns) == ('b',) df['c'] = data - assert df.columns == ('b', 'c') + assert tuple(df.columns) == ('b', 'c') df['a'] = data - assert df.columns == ('b', 'c', 'a') + assert tuple(df.columns) == ('b', 'c', 'a') @pytest.mark.parametrize('nelem', [0, 3, 100, 1000]) @@ -210,7 +210,7 @@ def test_dataframe_slicing(): # Row slice first 10 first_10 = df[:10] assert len(first_10) == 10 - assert first_10.columns == tuple(['a', 'b', 'c', 'd']) + assert tuple(first_10.columns) == ('a', 'b', 'c', 'd') np.testing.assert_equal(first_10['a'].to_array(), ha[:10]) np.testing.assert_equal(first_10['b'].to_array(), hb[:10]) np.testing.assert_equal(first_10['c'].to_array(), hc[:10]) @@ -220,7 +220,7 @@ def test_dataframe_slicing(): # Row slice last 10 last_10 = df[-10:] assert len(last_10) == 10 - assert last_10.columns == tuple(['a', 'b', 'c', 'd']) + assert tuple(last_10.columns) == ('a', 'b', 'c', 'd') np.testing.assert_equal(last_10['a'].to_array(), ha[-10:]) np.testing.assert_equal(last_10['b'].to_array(), hb[-10:]) np.testing.assert_equal(last_10['c'].to_array(), hc[-10:]) @@ -232,7 +232,7 @@ def test_dataframe_slicing(): end = 121 subrange = df[begin:end] assert len(subrange) == end - begin - assert subrange.columns == tuple(['a', 'b', 'c', 'd']) + assert tuple(subrange.columns) == ('a', 'b', 'c', 'd') np.testing.assert_equal(subrange['a'].to_array(), ha[begin:end]) np.testing.assert_equal(subrange['b'].to_array(), hb[begin:end]) np.testing.assert_equal(subrange['c'].to_array(), hc[begin:end]) @@ -252,14 +252,14 @@ def test_dataframe_loc(): # Full slice full = df.loc[:, ['c']] - assert full.columns == tuple(['c']) + assert tuple(full.columns) == ('c',) np.testing.assert_equal(full['c'].to_array(), hc) begin = 117 end = 122 fewer = df.loc[begin:end, ['c', 'd', 'a']] assert len(fewer) == end - begin + 1 - assert fewer.columns == tuple(['c', 'd', 'a']) + assert tuple(fewer.columns) == ('c', 'd', 'a') np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1]) np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1]) np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1]) @@ -272,7 +272,7 @@ def test_dataframe_loc(): end = 122 fewer = df2.loc[begin:end, ['c', 'd', 'a']] assert len(fewer) == end - begin + 1 - assert fewer.columns == tuple(['c', 'd', 'a']) + assert tuple(fewer.columns) == ('c', 'd', 'a') np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1]) np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1]) np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1]) diff --git a/pygdf/tests/test_onehot.py b/pygdf/tests/test_onehot.py index 5d99b95af8d..c73e0ca6b02 100644 --- a/pygdf/tests/test_onehot.py +++ b/pygdf/tests/test_onehot.py @@ -64,7 +64,7 @@ def test_onehot_masked(): out = df.one_hot_encoding('a', cats=list(range(high)), prefix='a', dtype=np.int32) - assert out.columns == tuple(['a', 'a_0', 'a_1', 'a_2', 'a_3', 'a_4']) + assert tuple(out.columns) == ('a', 'a_0', 'a_1', 'a_2', 'a_3', 'a_4') np.testing.assert_array_equal(out['a_0'] == 1, arr == 0) np.testing.assert_array_equal(out['a_1'] == 1, arr == 1) np.testing.assert_array_equal(out['a_2'] == 1, arr == 2)