Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions pygdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
from .dataframe import DataFrame
from .index import Index
from .series import Series
from .multi import concat

Expand All @@ -10,10 +11,3 @@
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

__all__ = [
DataFrame,
Series,
concat,
set_options,
]
15 changes: 13 additions & 2 deletions pygdf/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ def __init__(self, name_series=None, index=None):
self._cols = OrderedDict()
# has initializer?
if name_series is not None:
if isinstance(name_series, dict):
name_series = name_series.items()
for k, series in name_series:
self.add_column(k, series, forceindex=index is not None)

Expand Down Expand Up @@ -176,7 +178,9 @@ def __getitem__(self, arg):
3 3 3
"""
if isinstance(arg, str) or isinstance(arg, int):
return self._cols[arg]
s = self._cols[arg]
s.name = arg
return s
elif isinstance(arg, slice):
df = DataFrame()
for k, col in self._cols.items():
Expand Down Expand Up @@ -213,6 +217,12 @@ def __len__(self):
"""
return self._size

def assign(self, **kwargs):
    """Return a copy of this frame extended with one column per keyword.

    Mirrors ``pandas.DataFrame.assign``: the original frame is left
    untouched; each keyword name becomes a new column in the copy.
    """
    result = self.copy()
    for column_name, values in kwargs.items():
        result[column_name] = values
    return result

def head(self, n=5):
    """Return the first ``n`` rows as a new frame (a row slice of self)."""
    first_rows = self[:n]
    return first_rows

Expand Down Expand Up @@ -294,7 +304,7 @@ def loc(self):
def columns(self):
"""Returns a tuple of columns
"""
return tuple(self._cols)
return pd.Index(self._cols)

@property
def index(self):
Expand Down Expand Up @@ -415,6 +425,7 @@ def add_column(self, name, data, forceindex=False):
raise NameError('duplicated column name {!r}'.format(name))

series = self._prepare_series_for_add(data, forceindex=forceindex)
series.name = name
self._cols[name] = series

def drop_column(self, name):
Expand Down
18 changes: 15 additions & 3 deletions pygdf/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def to_pandas(self):
def gpu_values(self):
return self.as_column().to_gpu_array()

def min(self):
    """Return the minimum value of the index, via the underlying column."""
    values = self.as_column()
    return values.min()

def max(self):
    """Return the maximum value of the index, via the underlying column."""
    values = self.as_column()
    return values.max()

def sum(self):
    """Return the sum of the index values, via the underlying column."""
    values = self.as_column()
    return values.sum()

def find_segments(self):
"""Return the beginning index for segments

Expand Down Expand Up @@ -100,7 +109,7 @@ def join(self, other, method, how='left', return_indexers=False):
class RangeIndex(Index):
"""Basic start..stop
"""
def __init__(self, start, stop=None):
def __init__(self, start, stop=None, name=None):
"""RangeIndex(size), RangeIndex(start, stop)

Parameters
Expand All @@ -111,6 +120,7 @@ def __init__(self, start, stop=None):
start, stop = 0, start
self._start = int(start)
self._stop = int(stop)
self.name = name

def __repr__(self):
return "{}(start={}, stop={})".format(self.__class__.__name__,
Expand Down Expand Up @@ -183,7 +193,7 @@ def index_from_range(start, stop=None, step=None):


class GenericIndex(Index):
def __new__(self, values):
def __new__(self, values, name=None):
from .series import Series

# normalize the input
Expand All @@ -200,6 +210,7 @@ def __new__(self, values):
# Make GenericIndex object
res = Index.__new__(GenericIndex)
res._values = values
res.name = name
return res

def serialize(self, serialize):
Expand Down Expand Up @@ -271,7 +282,7 @@ def find_label_range(self, first, last):
class DatetimeIndex(GenericIndex):
# TODO this constructor should take a timezone or something to be
# consistent with pandas
def __new__(self, values):
def __new__(self, values, name=None):
# we should be more strict on what we accept here but
# we'd have to go and figure out all the semantics around
# pandas dtindex creation first which. For now
Expand All @@ -285,6 +296,7 @@ def __new__(self, values):
# override __new__ properly
res = Index.__new__(DatetimeIndex)
res._values = values
res.name = name
return res

@property
Expand Down
52 changes: 42 additions & 10 deletions pygdf/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from numbers import Number

import numpy as np
import pandas as pd

from . import cudautils, formatting
from .buffer import Buffer
Expand Down Expand Up @@ -58,10 +59,15 @@ def from_masked_array(cls, data, mask, null_count=None):
col = columnops.as_column(data).set_mask(mask, null_count=null_count)
return cls(data=col)

def __init__(self, data, index=None):
def __init__(self, data, index=None, name=None):
if isinstance(data, pd.Series):
name = data.name
index = GenericIndex(data.index)
if isinstance(data, Series):
index = data._index
name = data.name
data = data._column

if not isinstance(data, columnops.TypedColumnBase):
data = columnops.as_column(data)

Expand All @@ -71,6 +77,11 @@ def __init__(self, data, index=None):
assert isinstance(data, columnops.TypedColumnBase)
self._column = data
self._index = RangeIndex(len(data)) if index is None else index
self.name = name

@classmethod
def from_pandas(cls, s):
return cls(s)

def serialize(self, serialize):
header = {}
Expand Down Expand Up @@ -104,6 +115,7 @@ def _copy_construct_defaults(self):
return dict(
data=self._column,
index=self._index,
name=self.name,
)

def _copy_construct(self, **kwargs):
Expand Down Expand Up @@ -306,6 +318,12 @@ def __mul__(self, other):
def __rmul__(self, other):
return self._rbinaryop(other, 'mul')

def __pow__(self, other):
    """Element-wise power for positive integral exponents.

    There is no native pow kernel available yet (see the linked
    rapidsai/libgdf#94 work in review), so the result is built from the
    already-supported element-wise ``*`` operator.  Any exponent that is
    not a positive integer yields ``NotImplemented`` so Python raises
    the usual ``TypeError``.

    Parameters
    ----------
    other : int
        The exponent; must compare equal to a positive integer.

    Returns
    -------
    Series
        ``self`` raised element-wise to ``other``.
    """
    if other == 2:
        # Original fast path, kept byte-compatible: a single multiply.
        # Also covers float-valued exponents equal to 2 (e.g. 2.0),
        # which the previous implementation accepted.
        return self * self
    if isinstance(other, int) and other > 0:
        # Exponentiation by squaring: O(log other) multiplies instead
        # of other - 1 for the naive repeated product.
        result = None
        base = self
        exponent = other
        while exponent:
            if exponent & 1:
                result = base if result is None else result * base
            exponent >>= 1
            if exponent:
                # Only square when more bits remain, to avoid one
                # wasted multiply on the final iteration.
                base = base * base
        return result
    return NotImplemented
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is janky. Is there a GPU implementation somewhere that we can use?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mrocklin This is a WIP in rapidsai/libgdf#94


def __floordiv__(self, other):
return self._binaryop(other, 'floordiv')

Expand Down Expand Up @@ -370,8 +388,13 @@ def _concat(cls, objs, index=True):
if index is True:
index = Index._concat([o.index for o in objs])

names = {obj.name for obj in objs}
if len(names) == 1:
[name] = names
else:
name = None
col = Column._concat([o._column for o in objs])
return cls(data=col, index=index)
return cls(data=col, index=index, name=name)

def append(self, arbitrary):
"""Append values from another ``Series`` or array-like object.
Expand Down Expand Up @@ -442,7 +465,9 @@ def to_gpu_array(self, fillna=None):
def to_pandas(self, index=True):
if index is True:
index = self.index.to_pandas()
return self._column.to_pandas(index=index)
s = self._column.to_pandas(index=index)
s.name = self.name
return s

@property
def data(self):
Expand Down Expand Up @@ -688,37 +713,44 @@ def find_last_value(self, value):
#
# Stats
#
def count(self):
def count(self, axis=None, skipna=True):
"""The number of non-null values"""
assert axis in (None, 0) and skipna is True
return self.valid_count

def min(self):
def min(self, axis=None, skipna=True):
"""Compute the min of the series
"""
assert axis in (None, 0) and skipna is True
return self._column.min()

def max(self):
def max(self, axis=None, skipna=True):
"""Compute the max of the series
"""
assert axis in (None, 0) and skipna is True
return self._column.max()

def sum(self):
def sum(self, axis=None, skipna=True):
"""Compute the sum of the series"""
assert axis in (None, 0) and skipna is True
return self._column.sum()

def mean(self):
def mean(self, axis=None, skipna=True):
"""Compute the mean of the series
"""
assert axis in (None, 0) and skipna is True
return self._column.mean()

def std(self, ddof=1):
def std(self, ddof=1, axis=None, skipna=True):
"""Compute the standard deviation of the series
"""
assert axis in (None, 0) and skipna is True
return np.sqrt(self.var(ddof=ddof))

def var(self, ddof=1):
def var(self, ddof=1, axis=None, skipna=True):
"""Compute the variance of the series
"""
assert axis in (None, 0) and skipna is True
mu, var = self.mean_var(ddof=ddof)
return var

Expand Down
54 changes: 43 additions & 11 deletions pygdf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def test_dataframe_basic():
df['vals'] = rnd_vals
np.testing.assert_equal(df['vals'].to_array(), rnd_vals)
assert len(df) == 10
assert df.columns == ('keys', 'vals')
assert tuple(df.columns) == ('keys', 'vals')

# Make another dataframe
df2 = DataFrame()
Expand Down Expand Up @@ -177,13 +177,13 @@ def test_dataframe_column_add_drop():
data = np.asarray(range(10))
df['a'] = data
df['b'] = data
assert df.columns == ('a', 'b')
assert tuple(df.columns) == ('a', 'b')
del df['a']
assert df.columns == ('b',)
assert tuple(df.columns) == ('b',)
df['c'] = data
assert df.columns == ('b', 'c')
assert tuple(df.columns) == ('b', 'c')
df['a'] = data
assert df.columns == ('b', 'c', 'a')
assert tuple(df.columns) == ('b', 'c', 'a')


@pytest.mark.parametrize('nelem', [0, 3, 100, 1000])
Expand All @@ -210,7 +210,7 @@ def test_dataframe_slicing():
# Row slice first 10
first_10 = df[:10]
assert len(first_10) == 10
assert first_10.columns == tuple(['a', 'b', 'c', 'd'])
assert tuple(first_10.columns) == ('a', 'b', 'c', 'd')
np.testing.assert_equal(first_10['a'].to_array(), ha[:10])
np.testing.assert_equal(first_10['b'].to_array(), hb[:10])
np.testing.assert_equal(first_10['c'].to_array(), hc[:10])
Expand All @@ -220,7 +220,7 @@ def test_dataframe_slicing():
# Row slice last 10
last_10 = df[-10:]
assert len(last_10) == 10
assert last_10.columns == tuple(['a', 'b', 'c', 'd'])
assert tuple(last_10.columns) == ('a', 'b', 'c', 'd')
np.testing.assert_equal(last_10['a'].to_array(), ha[-10:])
np.testing.assert_equal(last_10['b'].to_array(), hb[-10:])
np.testing.assert_equal(last_10['c'].to_array(), hc[-10:])
Expand All @@ -232,7 +232,7 @@ def test_dataframe_slicing():
end = 121
subrange = df[begin:end]
assert len(subrange) == end - begin
assert subrange.columns == tuple(['a', 'b', 'c', 'd'])
assert tuple(subrange.columns) == ('a', 'b', 'c', 'd')
np.testing.assert_equal(subrange['a'].to_array(), ha[begin:end])
np.testing.assert_equal(subrange['b'].to_array(), hb[begin:end])
np.testing.assert_equal(subrange['c'].to_array(), hc[begin:end])
Expand All @@ -252,14 +252,14 @@ def test_dataframe_loc():

# Full slice
full = df.loc[:, ['c']]
assert full.columns == tuple(['c'])
assert tuple(full.columns) == ('c',)
np.testing.assert_equal(full['c'].to_array(), hc)

begin = 117
end = 122
fewer = df.loc[begin:end, ['c', 'd', 'a']]
assert len(fewer) == end - begin + 1
assert fewer.columns == tuple(['c', 'd', 'a'])
assert tuple(fewer.columns) == ('c', 'd', 'a')
np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1])
np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1])
np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1])
Expand All @@ -272,7 +272,7 @@ def test_dataframe_loc():
end = 122
fewer = df2.loc[begin:end, ['c', 'd', 'a']]
assert len(fewer) == end - begin + 1
assert fewer.columns == tuple(['c', 'd', 'a'])
assert tuple(fewer.columns) == ('c', 'd', 'a')
np.testing.assert_equal(fewer['a'].to_array(), ha[begin:end + 1])
np.testing.assert_equal(fewer['c'].to_array(), hc[begin:end + 1])
np.testing.assert_equal(fewer['d'].to_array(), hd[begin:end + 1])
Expand Down Expand Up @@ -518,6 +518,15 @@ def test_dataframe_setitem_index_len1():
np.testing.assert_equal(gdf.b.to_array(), [0])


def test_assign():
    """``DataFrame.assign`` adds columns to a copy, not to the original."""
    original = DataFrame({'x': [1, 2, 3]})
    extended = original.assign(y=original.x + 1)

    # The source frame must be untouched; only the copy gains the column.
    assert list(original.columns) == ['x']
    assert list(extended.columns) == ['x', 'y']
    np.testing.assert_equal(extended.y.to_array(), [2, 3, 4])


@pytest.mark.parametrize('nrows', [1, 8, 100, 1000])
def test_dataframe_hash_columns(nrows):
gdf = DataFrame()
Expand Down Expand Up @@ -661,3 +670,26 @@ def do_slice(x):
got = do_slice(gdf).to_pandas()

pd.testing.assert_frame_equal(expect, got)


def test_from_pandas():
    """Round-trip a pandas DataFrame and Series through the GPU types."""
    pdf = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])
    gdf = gd.DataFrame.from_pandas(pdf)
    assert isinstance(gdf, gd.DataFrame)
    pd.testing.assert_frame_equal(pdf, gdf.to_pandas())

    pds = pdf.x
    gds = gd.Series.from_pandas(pds)
    assert isinstance(gds, gd.Series)
    pd.testing.assert_series_equal(pds, gds.to_pandas())


@pytest.mark.xfail(reason="constructor does not coerce index inputs")
def test_index_in_dataframe_constructor():
    """DataFrame(...) should coerce a plain-list index like pandas does."""
    a = pd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])
    b = gd.DataFrame({'x': [1, 2, 3]}, index=[4., 5., 6.])

    pd.testing.assert_frame_equal(a, b.to_pandas())
    # Bug fix: ``assert_frame_equal`` returns None (it raises on
    # mismatch), so wrapping the call in an ``assert`` statement made
    # this line fail unconditionally, even for equal frames.
    pd.testing.assert_frame_equal(a.loc[4:], b.loc[4:].to_pandas())
Loading