diff --git a/pygdf/__init__.py b/pygdf/__init__.py index c6082cb8fa0..449c8a5fd17 100644 --- a/pygdf/__init__.py +++ b/pygdf/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) 2018, NVIDIA CORPORATION. from .dataframe import DataFrame +from .index import Index from .series import Series from .multi import concat @@ -10,10 +11,3 @@ from ._version import get_versions __version__ = get_versions()['version'] del get_versions - -__all__ = [ - DataFrame, - Series, - concat, - set_options, -] diff --git a/pygdf/dataframe.py b/pygdf/dataframe.py index bc65a6ee9b1..35b60bbc427 100644 --- a/pygdf/dataframe.py +++ b/pygdf/dataframe.py @@ -86,6 +86,8 @@ def __init__(self, name_series=None, index=None): self._cols = OrderedDict() # has initializer? if name_series is not None: + if isinstance(name_series, dict): + name_series = name_series.items() for k, series in name_series: self.add_column(k, series, forceindex=index is not None) @@ -176,7 +178,9 @@ def __getitem__(self, arg): 3 3 3 """ if isinstance(arg, str) or isinstance(arg, int): - return self._cols[arg] + s = self._cols[arg] + s.name = arg + return s elif isinstance(arg, slice): df = DataFrame() for k, col in self._cols.items(): @@ -213,6 +217,12 @@ def __len__(self): """ return self._size + def assign(self, **kwargs): + new = self.copy() + for k, v in kwargs.items(): + new[k] = v + return new + def head(self, n=5): return self[:n] @@ -294,7 +304,7 @@ def loc(self): def columns(self): - """Returns a tuple of columns + """Returns a pandas Index of the column names """ - return tuple(self._cols) + return pd.Index(self._cols) @property def index(self): @@ -415,6 +425,7 @@ def add_column(self, name, data, forceindex=False): raise NameError('duplicated column name {!r}'.format(name)) series = self._prepare_series_for_add(data, forceindex=forceindex) + series.name = name self._cols[name] = series def drop_column(self, name): diff --git a/pygdf/index.py b/pygdf/index.py index 28d68275235..47a642e42b5 100644 --- a/pygdf/index.py +++ b/pygdf/index.py @@ -53,6 +53,15 @@ def
to_pandas(self): def gpu_values(self): return self.as_column().to_gpu_array() + def min(self): + return self.as_column().min() + + def max(self): + return self.as_column().max() + + def sum(self): + return self.as_column().sum() + def find_segments(self): """Return the beginning index for segments @@ -100,7 +109,7 @@ def join(self, other, method, how='left', return_indexers=False): class RangeIndex(Index): """Basic start..stop """ - def __init__(self, start, stop=None): + def __init__(self, start, stop=None, name=None): """RangeIndex(size), RangeIndex(start, stop) Parameters @@ -111,6 +120,7 @@ def __init__(self, start, stop=None): start, stop = 0, start self._start = int(start) self._stop = int(stop) + self.name = name def __repr__(self): return "{}(start={}, stop={})".format(self.__class__.__name__, @@ -183,7 +193,7 @@ def index_from_range(start, stop=None, step=None): class GenericIndex(Index): - def __new__(self, values): + def __new__(self, values, name=None): from .series import Series # normalize the input @@ -200,6 +210,7 @@ def __new__(self, values): # Make GenericIndex object res = Index.__new__(GenericIndex) res._values = values + res.name = name return res def serialize(self, serialize): @@ -271,7 +282,7 @@ def find_label_range(self, first, last): class DatetimeIndex(GenericIndex): # TODO this constructor should take a timezone or something to be # consistent with pandas - def __new__(self, values): + def __new__(self, values, name=None): # we should be more strict on what we accept here but # we'd have to go and figure out all the semantics around # pandas dtindex creation first which. 
For now @@ -285,6 +296,7 @@ def __new__(self, values): # override __new__ properly res = Index.__new__(DatetimeIndex) res._values = values + res.name = name return res @property diff --git a/pygdf/series.py b/pygdf/series.py index d5e866186d1..5e06c7ee190 100644 --- a/pygdf/series.py +++ b/pygdf/series.py @@ -6,6 +6,7 @@ from numbers import Number import numpy as np +import pandas as pd from . import cudautils, formatting from .buffer import Buffer @@ -58,10 +59,15 @@ def from_masked_array(cls, data, mask, null_count=None): col = columnops.as_column(data).set_mask(mask, null_count=null_count) return cls(data=col) - def __init__(self, data, index=None): + def __init__(self, data, index=None, name=None): + if isinstance(data, pd.Series): + name = data.name + index = GenericIndex(data.index) if isinstance(data, Series): index = data._index + name = data.name data = data._column + if not isinstance(data, columnops.TypedColumnBase): data = columnops.as_column(data) @@ -71,6 +77,11 @@ def __init__(self, data, index=None): assert isinstance(data, columnops.TypedColumnBase) self._column = data self._index = RangeIndex(len(data)) if index is None else index + self.name = name + + @classmethod + def from_pandas(cls, s): + return cls(s) def serialize(self, serialize): header = {} @@ -104,6 +115,7 @@ def _copy_construct_defaults(self): return dict( data=self._column, index=self._index, + name=self.name, ) def _copy_construct(self, **kwargs): @@ -306,6 +318,12 @@ def __mul__(self, other): def __rmul__(self, other): return self._rbinaryop(other, 'mul') + def __pow__(self, other): + if isinstance(other, Number) and other == 2: + return self * self + else: + return NotImplemented + def __floordiv__(self, other): return self._binaryop(other, 'floordiv') @@ -370,8 +388,13 @@ def _concat(cls, objs, index=True): if index is True: index = Index._concat([o.index for o in objs]) + names = {obj.name for obj in objs} + if len(names) == 1: + [name] = names + else: + name = None col = Column._concat([o._column for o in
objs]) - return cls(data=col, index=index) + return cls(data=col, index=index, name=name) def append(self, arbitrary): """Append values from another ``Series`` or array-like object. @@ -442,7 +465,9 @@ def to_gpu_array(self, fillna=None): def to_pandas(self, index=True): if index is True: index = self.index.to_pandas() - return self._column.to_pandas(index=index) + s = self._column.to_pandas(index=index) + s.name = self.name + return s @property def data(self): @@ -688,37 +713,44 @@ def find_last_value(self, value): # # Stats # - def count(self): + def count(self, axis=None, skipna=True): """The number of non-null values""" + assert axis in (None, 0) and skipna is True return self.valid_count - def min(self): + def min(self, axis=None, skipna=True): """Compute the min of the series """ + assert axis in (None, 0) and skipna is True return self._column.min() - def max(self): + def max(self, axis=None, skipna=True): """Compute the max of the series """ + assert axis in (None, 0) and skipna is True return self._column.max() - def sum(self): + def sum(self, axis=None, skipna=True): """Compute the sum of the series""" + assert axis in (None, 0) and skipna is True return self._column.sum() - def mean(self): + def mean(self, axis=None, skipna=True): """Compute the mean of the series """ + assert axis in (None, 0) and skipna is True return self._column.mean() - def std(self, ddof=1): + def std(self, ddof=1, axis=None, skipna=True): """Compute the standard deviation of the series """ + assert axis in (None, 0) and skipna is True return np.sqrt(self.var(ddof=ddof)) - def var(self, ddof=1): + def var(self, ddof=1, axis=None, skipna=True): """Compute the variance of the series """ + assert axis in (None, 0) and skipna is True mu, var = self.mean_var(ddof=ddof) return var diff --git a/pygdf/tests/test_dataframe.py b/pygdf/tests/test_dataframe.py index 4c7e15390d3..70441529358 100644 --- a/pygdf/tests/test_dataframe.py +++ b/pygdf/tests/test_dataframe.py @@ -123,7 +123,7 @@ def 
test_dataframe_basic(): df['vals'] = rnd_vals np.testing.assert_equal(df['vals'].to_array(), rnd_vals) assert len(df) == 10 - assert df.columns == ('keys', 'vals') + assert tuple(df.columns) == ('keys', 'vals') # Make another dataframe df2 = DataFrame() @@ -177,13 +177,13 @@ def test_dataframe_column_add_drop(): data = np.asarray(range(10)) df['a'] = data df['b'] = data - assert df.columns == ('a', 'b') + assert tuple(df.columns) == ('a', 'b') del df['a'] - assert df.columns == ('b',) + assert tuple(df.columns) == ('b',) df['c'] = data - assert df.columns == ('b', 'c') + assert tuple(df.columns) == ('b', 'c') df['a'] = data - assert df.columns == ('b', 'c', 'a') + assert tuple(df.columns) == ('b', 'c', 'a') @pytest.mark.parametrize('nelem', [0, 3, 100, 1000]) @@ -210,7 +210,7 @@ def test_dataframe_slicing(): # Row slice first 10 first_10 = df[:10] assert len(first_10) == 10 - assert first_10.columns == tuple(['a', 'b', 'c', 'd']) + assert tuple(first_10.columns) == ('a', 'b', 'c', 'd') np.testing.assert_equal(first_10['a'].to_array(), ha[:10]) np.testing.assert_equal(first_10['b'].to_array(), hb[:10]) np.testing.assert_equal(first_10['c'].to_array(), hc[:10]) @@ -220,7 +220,7 @@ def test_dataframe_slicing(): # Row slice last 10 last_10 = df[-10:] assert len(last_10) == 10 - assert last_10.columns == tuple(['a', 'b', 'c', 'd']) + assert tuple(last_10.columns) == ('a', 'b', 'c', 'd') np.testing.assert_equal(last_10['a'].to_array(), ha[-10:]) np.testing.assert_equal(last_10['b'].to_array(), hb[-10:]) np.testing.assert_equal(last_10['c'].to_array(), hc[-10:]) @@ -232,7 +232,7 @@ def test_dataframe_slicing(): end = 121 subrange = df[begin:end] assert len(subrange) == end - begin - assert subrange.columns == tuple(['a', 'b', 'c', 'd']) + assert tuple(subrange.columns) == ('a', 'b', 'c', 'd') np.testing.assert_equal(subrange['a'].to_array(), ha[begin:end]) np.testing.assert_equal(subrange['b'].to_array(), hb[begin:end]) np.testing.assert_equal(subrange['c'].to_array(), 
hc[begin:end]) @@ -252,14 +252,14 @@ def test_dataframe_loc(): # Full slice full = df.loc[:, ['c']] - assert full.columns == tuple(['c']) + assert tuple(full.columns) == ('c',) np.testing.assert_equal(full['c'].to_array(), hc) begin = 117 end = 122 fewer = df.loc[begin:end, ['c', 'd', 'a']] assert len(fewer) == end - begin + 1 - assert fewer.columns == tuple(['c', 'd', 'a']) + assert tuple(fewer.columns) == ('c', 'd', 'a') np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1]) np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1]) np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1]) @@ -272,7 +272,7 @@ def test_dataframe_loc(): end = 122 fewer = df2.loc[begin:end, ['c', 'd', 'a']] assert len(fewer) == end - begin + 1 - assert fewer.columns == tuple(['c', 'd', 'a']) + assert tuple(fewer.columns) == ('c', 'd', 'a') np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1]) np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1]) np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1]) @@ -518,6 +518,15 @@ def test_dataframe_setitem_index_len1(): np.testing.assert_equal(gdf.b.to_array(), [0]) +def test_assign(): + gdf = DataFrame({'x': [1, 2, 3]}) + gdf2 = gdf.assign(y=gdf.x + 1) + assert list(gdf.columns) == ['x'] + assert list(gdf2.columns) == ['x', 'y'] + + np.testing.assert_equal(gdf2.y.to_array(), [2, 3, 4]) + + @pytest.mark.parametrize('nrows', [1, 8, 100, 1000]) def test_dataframe_hash_columns(nrows): gdf = DataFrame() @@ -661,3 +670,26 @@ def do_slice(x): got = do_slice(gdf).to_pandas() pd.testing.assert_frame_equal(expect, got) + + +def test_from_pandas(): + df = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + gdf = gd.DataFrame.from_pandas(df) + assert isinstance(gdf, gd.DataFrame) + + pd.testing.assert_frame_equal(df, gdf.to_pandas()) + + s = df.x + gs = gd.Series.from_pandas(s) + assert isinstance(gs, gd.Series) + + pd.testing.assert_series_equal(s, gs.to_pandas()) + + 
+@pytest.mark.xfail(reason="constructor does not coerce index inputs") +def test_index_in_dataframe_constructor(): + a = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + b = gd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.]) + + pd.testing.assert_frame_equal(a, b.to_pandas()) + pd.testing.assert_frame_equal(a.loc[4:], b.loc[4:].to_pandas()) diff --git a/pygdf/tests/test_index.py b/pygdf/tests/test_index.py index 822e529d722..68860be6f3e 100644 --- a/pygdf/tests/test_index.py +++ b/pygdf/tests/test_index.py @@ -71,3 +71,20 @@ def test_index_comparision(): assert gi == rg assert rg[:-1] != gi assert rg[:-1] == gi[:-1] + + +@pytest.mark.parametrize('func', [ + lambda x: x.min(), + lambda x: x.max(), + lambda x: x.sum(), +]) +def test_reductions(func): + x = np.asarray([4, 5, 6, 10]) + idx = GenericIndex(np.asarray([4, 5, 6, 10])) + + assert func(x) == func(idx) + + +def test_name(): + idx = GenericIndex(np.asarray([4, 5, 6, 10]), name='foo') + assert idx.name == 'foo' diff --git a/pygdf/tests/test_onehot.py b/pygdf/tests/test_onehot.py index 5d99b95af8d..c73e0ca6b02 100644 --- a/pygdf/tests/test_onehot.py +++ b/pygdf/tests/test_onehot.py @@ -64,7 +64,7 @@ def test_onehot_masked(): out = df.one_hot_encoding('a', cats=list(range(high)), prefix='a', dtype=np.int32) - assert out.columns == tuple(['a', 'a_0', 'a_1', 'a_2', 'a_3', 'a_4']) + assert tuple(out.columns) == ('a', 'a_0', 'a_1', 'a_2', 'a_3', 'a_4') np.testing.assert_array_equal(out['a_0'] == 1, arr == 0) np.testing.assert_array_equal(out['a_1'] == 1, arr == 1) np.testing.assert_array_equal(out['a_2'] == 1, arr == 2) diff --git a/setup.cfg b/setup.cfg index 60dcff452f9..82dd2d8bcd2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,4 +13,4 @@ tag_prefix = v parentdir_prefix = pygdf- [flake8] -exclude = img,notebooks,thirdparty +exclude = img,notebooks,thirdparty,__init__.py