Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions pygdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
from .dataframe import DataFrame
from .index import Index
from .series import Series
from .multi import concat

Expand All @@ -10,10 +11,3 @@
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

__all__ = [
DataFrame,
Series,
concat,
set_options,
]
15 changes: 13 additions & 2 deletions pygdf/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ def __init__(self, name_series=None, index=None):
self._cols = OrderedDict()
# has initializer?
if name_series is not None:
if isinstance(name_series, dict):
name_series = name_series.items()
for k, series in name_series:
self.add_column(k, series, forceindex=index is not None)

Expand Down Expand Up @@ -176,7 +178,9 @@ def __getitem__(self, arg):
3 3 3
"""
if isinstance(arg, str) or isinstance(arg, int):
return self._cols[arg]
s = self._cols[arg]
s.name = arg
return s
elif isinstance(arg, slice):
df = DataFrame()
for k, col in self._cols.items():
Expand Down Expand Up @@ -213,6 +217,12 @@ def __len__(self):
"""
return self._size

def assign(self, **kwargs):
    """Return a copy of this frame extended with one column per keyword.

    Mirrors ``pandas.DataFrame.assign``: the original frame is left
    untouched; each keyword name becomes a new column in the copy.
    """
    result = self.copy()
    for column_name, values in kwargs.items():
        result[column_name] = values
    return result

def head(self, n=5):
    """Return the first ``n`` rows as a new frame (a row slice of self)."""
    first_rows = self[:n]
    return first_rows

Expand Down Expand Up @@ -294,7 +304,7 @@ def loc(self):
def columns(self):
"""Returns a tuple of columns
"""
return tuple(self._cols)
return pd.Index(self._cols)

@property
def index(self):
Expand Down Expand Up @@ -415,6 +425,7 @@ def add_column(self, name, data, forceindex=False):
raise NameError('duplicated column name {!r}'.format(name))

series = self._prepare_series_for_add(data, forceindex=forceindex)
series.name = name
self._cols[name] = series

def drop_column(self, name):
Expand Down
18 changes: 15 additions & 3 deletions pygdf/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def to_pandas(self):
def gpu_values(self):
return self.as_column().to_gpu_array()

def min(self):
    """Return the minimum value of the index, via the underlying column."""
    values = self.as_column()
    return values.min()

def max(self):
    """Return the maximum value of the index, via the underlying column."""
    values = self.as_column()
    return values.max()

def sum(self):
    """Return the sum of the index values, via the underlying column."""
    values = self.as_column()
    return values.sum()

def find_segments(self):
"""Return the beginning index for segments

Expand Down Expand Up @@ -100,7 +109,7 @@ def join(self, other, method, how='left', return_indexers=False):
class RangeIndex(Index):
"""Basic start..stop
"""
def __init__(self, start, stop=None):
def __init__(self, start, stop=None, name=None):
"""RangeIndex(size), RangeIndex(start, stop)

Parameters
Expand All @@ -111,6 +120,7 @@ def __init__(self, start, stop=None):
start, stop = 0, start
self._start = int(start)
self._stop = int(stop)
self.name = name

def __repr__(self):
return "{}(start={}, stop={})".format(self.__class__.__name__,
Expand Down Expand Up @@ -183,7 +193,7 @@ def index_from_range(start, stop=None, step=None):


class GenericIndex(Index):
def __new__(self, values):
def __new__(self, values, name=None):
from .series import Series

# normalize the input
Expand All @@ -200,6 +210,7 @@ def __new__(self, values):
# Make GenericIndex object
res = Index.__new__(GenericIndex)
res._values = values
res.name = name
return res

def serialize(self, serialize):
Expand Down Expand Up @@ -271,7 +282,7 @@ def find_label_range(self, first, last):
class DatetimeIndex(GenericIndex):
# TODO this constructor should take a timezone or something to be
# consistent with pandas
def __new__(self, values):
def __new__(self, values, name=None):
# we should be more strict on what we accept here but
# we'd have to go and figure out all the semantics around
# pandas dtindex creation first which. For now
Expand All @@ -285,6 +296,7 @@ def __new__(self, values):
# override __new__ properly
res = Index.__new__(DatetimeIndex)
res._values = values
res.name = name
return res

@property
Expand Down
52 changes: 42 additions & 10 deletions pygdf/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from numbers import Number

import numpy as np
import pandas as pd

from . import cudautils, formatting
from .buffer import Buffer
Expand Down Expand Up @@ -58,10 +59,15 @@ def from_masked_array(cls, data, mask, null_count=None):
col = columnops.as_column(data).set_mask(mask, null_count=null_count)
return cls(data=col)

def __init__(self, data, index=None):
def __init__(self, data, index=None, name=None):
if isinstance(data, pd.Series):
name = data.name
index = GenericIndex(data.index)
if isinstance(data, Series):
index = data._index
name = data.name
data = data._column

if not isinstance(data, columnops.TypedColumnBase):
data = columnops.as_column(data)

Expand All @@ -71,6 +77,11 @@ def __init__(self, data, index=None):
assert isinstance(data, columnops.TypedColumnBase)
self._column = data
self._index = RangeIndex(len(data)) if index is None else index
self.name = name

@classmethod
def from_pandas(cls, s):
return cls(s)

def serialize(self, serialize):
header = {}
Expand Down Expand Up @@ -104,6 +115,7 @@ def _copy_construct_defaults(self):
return dict(
data=self._column,
index=self._index,
name=self.name,
)

def _copy_construct(self, **kwargs):
Expand Down Expand Up @@ -306,6 +318,12 @@ def __mul__(self, other):
def __rmul__(self, other):
return self._rbinaryop(other, 'mul')

def __pow__(self, other):
    """Element-wise power for positive integral exponents.

    There is no native pow kernel available yet (see the linked
    rapidsai/libgdf#94 work in review), so the result is built from the
    already-supported element-wise ``*`` operator.  Any exponent that is
    not a positive integer yields ``NotImplemented`` so Python raises
    the usual ``TypeError``.

    Parameters
    ----------
    other : int
        The exponent; must compare equal to a positive integer.

    Returns
    -------
    Series
        ``self`` raised element-wise to ``other``.
    """
    if other == 2:
        # Original fast path, kept byte-compatible: a single multiply.
        # Also covers float-valued exponents equal to 2 (e.g. 2.0),
        # which the previous implementation accepted.
        return self * self
    if isinstance(other, int) and other > 0:
        # Exponentiation by squaring: O(log other) multiplies instead
        # of other - 1 for the naive repeated product.
        result = None
        base = self
        exponent = other
        while exponent:
            if exponent & 1:
                result = base if result is None else result * base
            exponent >>= 1
            if exponent:
                # Only square when more bits remain, to avoid one
                # wasted multiply on the final iteration.
                base = base * base
        return result
    return NotImplemented
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is janky. Is there a GPU implementation somewhere that we can use?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mrocklin This is a WIP in rapidsai/libgdf#94


def __floordiv__(self, other):
return self._binaryop(other, 'floordiv')

Expand Down Expand Up @@ -370,8 +388,13 @@ def _concat(cls, objs, index=True):
if index is True:
index = Index._concat([o.index for o in objs])

names = {obj.name for obj in objs}
if len(names) == 1:
[name] = names
else:
name = None
col = Column._concat([o._column for o in objs])
return cls(data=col, index=index)
return cls(data=col, index=index, name=name)

def append(self, arbitrary):
"""Append values from another ``Series`` or array-like object.
Expand Down Expand Up @@ -442,7 +465,9 @@ def to_gpu_array(self, fillna=None):
def to_pandas(self, index=True):
if index is True:
index = self.index.to_pandas()
return self._column.to_pandas(index=index)
s = self._column.to_pandas(index=index)
s.name = self.name
return s

@property
def data(self):
Expand Down Expand Up @@ -688,37 +713,44 @@ def find_last_value(self, value):
#
# Stats
#
def count(self):
def count(self, axis=None, skipna=True):
"""The number of non-null values"""
assert axis in (None, 0) and skipna is True
return self.valid_count

def min(self):
def min(self, axis=None, skipna=True):
"""Compute the min of the series
"""
assert axis in (None, 0) and skipna is True
return self._column.min()

def max(self):
def max(self, axis=None, skipna=True):
"""Compute the max of the series
"""
assert axis in (None, 0) and skipna is True
return self._column.max()

def sum(self):
def sum(self, axis=None, skipna=True):
"""Compute the sum of the series"""
assert axis in (None, 0) and skipna is True
return self._column.sum()

def mean(self):
def mean(self, axis=None, skipna=True):
"""Compute the mean of the series
"""
assert axis in (None, 0) and skipna is True
return self._column.mean()

def std(self, ddof=1):
def std(self, ddof=1, axis=None, skipna=True):
"""Compute the standard deviation of the series
"""
assert axis in (None, 0) and skipna is True
return np.sqrt(self.var(ddof=ddof))

def var(self, ddof=1):
def var(self, ddof=1, axis=None, skipna=True):
"""Compute the variance of the series
"""
assert axis in (None, 0) and skipna is True
mu, var = self.mean_var(ddof=ddof)
return var

Expand Down
54 changes: 43 additions & 11 deletions pygdf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def test_dataframe_basic():
df['vals'] = rnd_vals
np.testing.assert_equal(df['vals'].to_array(), rnd_vals)
assert len(df) == 10
assert df.columns == ('keys', 'vals')
assert tuple(df.columns) == ('keys', 'vals')

# Make another dataframe
df2 = DataFrame()
Expand Down Expand Up @@ -177,13 +177,13 @@ def test_dataframe_column_add_drop():
data = np.asarray(range(10))
df['a'] = data
df['b'] = data
assert df.columns == ('a', 'b')
assert tuple(df.columns) == ('a', 'b')
del df['a']
assert df.columns == ('b',)
assert tuple(df.columns) == ('b',)
df['c'] = data
assert df.columns == ('b', 'c')
assert tuple(df.columns) == ('b', 'c')
df['a'] = data
assert df.columns == ('b', 'c', 'a')
assert tuple(df.columns) == ('b', 'c', 'a')


@pytest.mark.parametrize('nelem', [0, 3, 100, 1000])
Expand All @@ -210,7 +210,7 @@ def test_dataframe_slicing():
# Row slice first 10
first_10 = df[:10]
assert len(first_10) == 10
assert first_10.columns == tuple(['a', 'b', 'c', 'd'])
assert tuple(first_10.columns) == ('a', 'b', 'c', 'd')
np.testing.assert_equal(first_10['a'].to_array(), ha[:10])
np.testing.assert_equal(first_10['b'].to_array(), hb[:10])
np.testing.assert_equal(first_10['c'].to_array(), hc[:10])
Expand All @@ -220,7 +220,7 @@ def test_dataframe_slicing():
# Row slice last 10
last_10 = df[-10:]
assert len(last_10) == 10
assert last_10.columns == tuple(['a', 'b', 'c', 'd'])
assert tuple(last_10.columns) == ('a', 'b', 'c', 'd')
np.testing.assert_equal(last_10['a'].to_array(), ha[-10:])
np.testing.assert_equal(last_10['b'].to_array(), hb[-10:])
np.testing.assert_equal(last_10['c'].to_array(), hc[-10:])
Expand All @@ -232,7 +232,7 @@ def test_dataframe_slicing():
end = 121
subrange = df[begin:end]
assert len(subrange) == end - begin
assert subrange.columns == tuple(['a', 'b', 'c', 'd'])
assert tuple(subrange.columns) == ('a', 'b', 'c', 'd')
np.testing.assert_equal(subrange['a'].to_array(), ha[begin:end])
np.testing.assert_equal(subrange['b'].to_array(), hb[begin:end])
np.testing.assert_equal(subrange['c'].to_array(), hc[begin:end])
Expand All @@ -252,14 +252,14 @@ def test_dataframe_loc():

# Full slice
full = df.loc[:, ['c']]
assert full.columns == tuple(['c'])
assert tuple(full.columns) == ('c',)
np.testing.assert_equal(full['c'].to_array(), hc)

begin = 117
end = 122
fewer = df.loc[begin:end, ['c', 'd', 'a']]
assert len(fewer) == end - begin + 1
assert fewer.columns == tuple(['c', 'd', 'a'])
assert tuple(fewer.columns) == ('c', 'd', 'a')
np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1])
np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1])
np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1])
Expand All @@ -272,7 +272,7 @@ def test_dataframe_loc():
end = 122
fewer = df2.loc[begin:end, ['c', 'd', 'a']]
assert len(fewer) == end - begin + 1
assert fewer.columns == tuple(['c', 'd', 'a'])
assert tuple(fewer.columns) == ('c', 'd', 'a')
np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1])
np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1])
np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1])
Expand Down Expand Up @@ -518,6 +518,15 @@ def test_dataframe_setitem_index_len1():
np.testing.assert_equal(gdf.b.to_array(), [0])


def test_assign():
    """``DataFrame.assign`` adds columns to a copy, not to the original."""
    original = DataFrame({'x': [1, 2, 3]})
    extended = original.assign(y=original.x + 1)

    # The source frame must be untouched; only the copy gains the column.
    assert list(original.columns) == ['x']
    assert list(extended.columns) == ['x', 'y']
    np.testing.assert_equal(extended.y.to_array(), [2, 3, 4])


@pytest.mark.parametrize('nrows', [1, 8, 100, 1000])
def test_dataframe_hash_columns(nrows):
gdf = DataFrame()
Expand Down Expand Up @@ -661,3 +670,26 @@ def do_slice(x):
got = do_slice(gdf).to_pandas()

pd.testing.assert_frame_equal(expect, got)


def test_from_pandas():
    """Round-trip a pandas DataFrame and Series through the GPU types."""
    pdf = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])
    gdf = gd.DataFrame.from_pandas(pdf)
    assert isinstance(gdf, gd.DataFrame)
    pd.testing.assert_frame_equal(pdf, gdf.to_pandas())

    pds = pdf.x
    gds = gd.Series.from_pandas(pds)
    assert isinstance(gds, gd.Series)
    pd.testing.assert_series_equal(pds, gds.to_pandas())


@pytest.mark.xfail(reason="constructor does not coerce index inputs")
def test_index_in_dataframe_constructor():
    """DataFrame(...) should coerce a plain-list index like pandas does."""
    a = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])
    b = gd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])

    pd.testing.assert_frame_equal(a, b.to_pandas())
    # Bug fix: ``assert_frame_equal`` returns None (it raises on
    # mismatch), so wrapping the call in an ``assert`` statement made
    # this line fail unconditionally, even for equal frames.
    pd.testing.assert_frame_equal(a.loc[4:], b.loc[4:].to_pandas())
Loading