diff --git a/pygpu/__init__.py b/pygpu/__init__.py
index 566cd8d19b..44c0766782 100644
--- a/pygpu/__init__.py
+++ b/pygpu/__init__.py
@@ -4,7 +4,7 @@ def get_include():
     assert os.path.exists(os.path.join(p, 'gpuarray_api.h'))
     return p
 
-from . import gpuarray, elemwise, reduction
+from . import gpuarray, elemwise, reduction, ufuncs
 from .gpuarray import (init, set_default_context, get_default_context,
                        array, zeros, empty, asarray, ascontiguousarray,
                        asfortranarray, register_dtype)
diff --git a/pygpu/_array.py b/pygpu/_array.py
index 2fc793f838..ca46c36227 100644
--- a/pygpu/_array.py
+++ b/pygpu/_array.py
@@ -1,9 +1,9 @@
 from __future__ import division
-import numpy as np
+import numpy
 
-from .elemwise import elemwise1, elemwise2, ielemwise2, compare, arg, GpuElemwise, as_argument
-from .reduction import reduce1
-from .dtypes import dtype_to_ctype, get_np_obj, get_common_dtype
+from .elemwise import elemwise1, arg, GpuElemwise, as_argument
+from .dtypes import dtype_to_ctype, get_common_dtype
+from . import ufuncs
 from . import gpuarray
 
 
@@ -19,125 +19,126 @@ class ndgpuarray(gpuarray.GpuArray):
     more like a drop-in replacement for numpy.ndarray than the raw
     GpuArray class.
     """
+    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+        # Unwrap out if necessary
+        out_tuple = kwargs.pop('out', ())
+        if not all(isinstance(out, gpuarray.GpuArray) for out in out_tuple):
+            return NotImplemented
+
+        if len(out_tuple) == 1:
+            kwargs['out'] = out_tuple[0]
+        elif len(out_tuple) == 2:
+            kwargs['out1'] = out_tuple[0]
+            kwargs['out2'] = out_tuple[1]
+
+        native_ufunc = getattr(ufuncs, ufunc.__name__, None)
+        if native_ufunc is None:
+            return NotImplemented
+
+        if method == '__call__':
+            return native_ufunc(*inputs, **kwargs)
+        else:
+            function = getattr(native_ufunc, method, None)
+            return function(*inputs, **kwargs)
+
+    # The special methods call explicitly into __array_ufunc__ to make
+    # sure that the native ufuncs are used also for Numpy 1.12 and below.
+    # The only difference will be the behavior when Numpy ufuncs are
+    # used: for 1.12 and below, numpy.add(x_gpu, y) will cast x_gpu to
+    # a Numpy array, 1.13 and above calls into the native GPU ufunc.
+
     # add
     def __add__(self, other):
-        return elemwise2(self, '+', other, self, broadcast=True)
+        return self.__array_ufunc__(numpy.add, '__call__', self, other)
 
     def __radd__(self, other):
-        return elemwise2(other, '+', self, self, broadcast=True)
+        return self.__array_ufunc__(numpy.add, '__call__', other, self)
 
     def __iadd__(self, other):
-        return ielemwise2(self, '+', other, broadcast=True)
+        return self.__array_ufunc__(numpy.add, '__call__', self, other,
+                                    out=self)
 
     # sub
     def __sub__(self, other):
-        return elemwise2(self, '-', other, self, broadcast=True)
+        return self.__array_ufunc__(numpy.subtract, '__call__', self, other)
 
     def __rsub__(self, other):
-        return elemwise2(other, '-', self, self, broadcast=True)
+        return self.__array_ufunc__(numpy.subtract, '__call__', other, self)
 
     def __isub__(self, other):
-        return ielemwise2(self, '-', other, broadcast=True)
+        return self.__array_ufunc__(numpy.subtract, '__call__', self, other,
+                                    out=self)
 
     # mul
     def __mul__(self, other):
-        return elemwise2(self, '*', other, self, broadcast=True)
+        return self.__array_ufunc__(numpy.multiply, '__call__', self, other)
 
     def __rmul__(self, other):
-        return elemwise2(other, '*', self, self, broadcast=True)
+        return self.__array_ufunc__(numpy.multiply, '__call__', other, self)
 
     def __imul__(self, other):
-        return ielemwise2(self, '*', other, broadcast=True)
+        return self.__array_ufunc__(numpy.multiply, '__call__', self, other,
+                                    out=self)
 
     # div
     def __div__(self, other):
-        return elemwise2(self, '/', other, self, broadcast=True)
+        return self.__array_ufunc__(numpy.divide, '__call__', self, other)
 
     def __rdiv__(self, other):
-        return elemwise2(other, '/', self, self, broadcast=True)
+        return self.__array_ufunc__(numpy.divide, '__call__', other, self)
 
     def __idiv__(self, other):
-        return ielemwise2(self, '/', other, broadcast=True)
+        return self.__array_ufunc__(numpy.divide, '__call__', self, other,
+                                    out=self)
 
     # truediv
     def __truediv__(self, other):
-        np1 = get_np_obj(self)
-        np2 = get_np_obj(other)
-        res = (np1.__truediv__(np2)).dtype
-        return elemwise2(self, '/', other, self, odtype=res, broadcast=True)
+        return self.__array_ufunc__(numpy.true_divide, '__call__',
+                                    self, other)
 
     def __rtruediv__(self, other):
-        np1 = get_np_obj(self)
-        np2 = get_np_obj(other)
-        res = (np2.__truediv__(np1)).dtype
-        return elemwise2(other, '/', self, self, odtype=res, broadcast=True)
+        return self.__array_ufunc__(numpy.true_divide, '__call__',
+                                    other, self)
 
     def __itruediv__(self, other):
-        np2 = get_np_obj(other)
-        kw = {'broadcast': True}
-        if self.dtype == np.float32 or np2.dtype == np.float32:
-            kw['op_tmpl'] = "a = (float)a / (float)b"
-        if self.dtype == np.float64 or np2.dtype == np.float64:
-            kw['op_tmpl'] = "a = (double)a / (double)b"
-        return ielemwise2(self, '/', other, **kw)
+        return self.__array_ufunc__(numpy.true_divide, '__call__',
+                                    self, other, out=self)
 
     # floordiv
     def __floordiv__(self, other):
-        out_dtype = get_common_dtype(self, other, True)
-        kw = {'broadcast': True}
-        if out_dtype.kind == 'f':
-            kw['op_tmpl'] = "res = floor((%(out_t)s)a / (%(out_t)s)b)"
-        return elemwise2(self, '/', other, self, odtype=out_dtype, **kw)
+        return self.__array_ufunc__(numpy.floor_divide, '__call__',
+                                    self, other)
 
     def __rfloordiv__(self, other):
-        out_dtype = get_common_dtype(other, self, True)
-        kw = {'broadcast': True}
-        if out_dtype.kind == 'f':
-            kw['op_tmpl'] = "res = floor((%(out_t)s)a / (%(out_t)s)b)"
-        return elemwise2(other, '/', self, self, odtype=out_dtype, **kw)
+        return self.__array_ufunc__(numpy.floor_divide, '__call__',
+                                    other, self)
 
     def __ifloordiv__(self, other):
-        out_dtype = self.dtype
-        kw = {'broadcast': True}
-        if out_dtype == np.float32:
-            kw['op_tmpl'] = "a = floor((float)a / (float)b)"
-        if out_dtype == np.float64:
-            kw['op_tmpl'] = "a = floor((double)a / (double)b)"
-        return ielemwise2(self, '/', other, **kw)
+        return self.__array_ufunc__(numpy.floor_divide, '__call__',
+                                    self, other, out=self)
 
     # mod
     def __mod__(self, other):
-        out_dtype = get_common_dtype(self, other, True)
-        kw = {'broadcast': True}
-        if out_dtype.kind == 'f':
-            kw['op_tmpl'] = "res = fmod((%(out_t)s)a, (%(out_t)s)b)"
-        return elemwise2(self, '%', other, self, odtype=out_dtype, **kw)
+        return self.__array_ufunc__(numpy.mod, '__call__', self, other)
 
     def __rmod__(self, other):
-        out_dtype = get_common_dtype(other, self, True)
-        kw = {'broadcast': True}
-        if out_dtype.kind == 'f':
-            kw['op_tmpl'] = "res = fmod((%(out_t)s)a, (%(out_t)s)b)"
-        return elemwise2(other, '%', self, self, odtype=out_dtype, **kw)
+        return self.__array_ufunc__(numpy.mod, '__call__', other, self)
 
     def __imod__(self, other):
-        out_dtype = get_common_dtype(self, other, self.dtype == np.float64)
-        kw = {'broadcast': True}
-        if out_dtype == np.float32:
-            kw['op_tmpl'] = "a = fmod((float)a, (float)b)"
-        if out_dtype == np.float64:
-            kw['op_tmpl'] = "a = fmod((double)a, (double)b)"
-        return ielemwise2(self, '%', other, **kw)
+        return self.__array_ufunc__(numpy.mod, '__call__', self, other,
+                                    out=self)
 
     # divmod
     def __divmod__(self, other):
         if not isinstance(other, gpuarray.GpuArray):
-            other = np.asarray(other)
+            other = numpy.asarray(other)
         odtype = get_common_dtype(self, other, True)
 
         a_arg = as_argument(self, 'a', read=True)
         b_arg = as_argument(other, 'b', read=True)
-        args = [arg('div', odtype, write=True), arg('mod', odtype, write=True), a_arg, b_arg]
+        args = [arg('div', odtype, write=True),
+                arg('mod', odtype, write=True),
+                a_arg, b_arg]
 
         div = self._empty_like_me(dtype=odtype)
         mod = self._empty_like_me(dtype=odtype)
@@ -157,12 +158,14 @@ def __divmod__(self, other):
 
     def __rdivmod__(self, other):
         if not isinstance(other, gpuarray.GpuArray):
-            other = np.asarray(other)
+            other = numpy.asarray(other)
         odtype = get_common_dtype(other, self, True)
 
         a_arg = as_argument(other, 'a', read=True)
         b_arg = as_argument(self, 'b', read=True)
-        args = [arg('div', odtype, write=True), arg('mod', odtype, write=True), a_arg, b_arg]
+        args = [arg('div', odtype, write=True),
+                arg('mod', odtype, write=True),
+                a_arg, b_arg]
 
         div = self._empty_like_me(dtype=odtype)
         mod = self._empty_like_me(dtype=odtype)
@@ -180,42 +183,112 @@ def __rdivmod__(self, other):
         k(div, mod, other, self, broadcast=True)
         return (div, mod)
 
+    # unary ops
     def __neg__(self):
-        return elemwise1(self, '-')
+        return self.__array_ufunc__(numpy.negative, '__call__', self)
 
     def __pos__(self):
         return elemwise1(self, '+')
 
     def __abs__(self):
-        if self.dtype.kind == 'u':
-            return self.copy()
-        if self.dtype.kind == 'f':
-            oper = "res = fabs(a)"
-        elif self.dtype.itemsize < 4:
-            # cuda 5.5 finds the c++ stdlib definition if we don't cast here.
-            oper = "res = abs((int)a)"
-        else:
-            oper = "res = abs(a)"
-        return elemwise1(self, None, oper=oper)
+        return self.__array_ufunc__(numpy.abs, '__call__', self)
+
+    def __invert__(self):
+        return self.__array_ufunc__(numpy.invert, '__call__', self)
 
     # richcmp
     def __lt__(self, other):
-        return compare(self, '<', other, broadcast=True)
+        return self.__array_ufunc__(numpy.less, '__call__', self, other)
 
     def __le__(self, other):
-        return compare(self, '<=', other, broadcast=True)
+        return self.__array_ufunc__(numpy.less_equal, '__call__', self, other)
 
     def __eq__(self, other):
-        return compare(self, '==', other, broadcast=True)
+        return self.__array_ufunc__(numpy.equal, '__call__', self, other)
 
     def __ne__(self, other):
-        return compare(self, '!=', other, broadcast=True)
+        return self.__array_ufunc__(numpy.not_equal, '__call__', self, other)
+
+    def __gt__(self, other):
+        return self.__array_ufunc__(numpy.greater, '__call__', self, other)
 
     def __ge__(self, other):
-        return compare(self, '>=', other, broadcast=True)
+        return self.__array_ufunc__(numpy.greater_equal, '__call__', self,
+                                    other)
 
-    def __gt__(self, other):
-        return compare(self, '>', other, broadcast=True)
+    # pow
+    # TODO: pow can take a third modulo argument
+    def __pow__(self, other):
+        return self.__array_ufunc__(numpy.power, '__call__', self, other)
+
+    def __rpow__(self, other):
+        return self.__array_ufunc__(numpy.power, '__call__', other, self)
+
+    def __ipow__(self, other):
+        return self.__array_ufunc__(numpy.power, '__call__', self, other,
+                                    out=self)
+
+    # shifts
+    def __lshift__(self, other):
+        return self.__array_ufunc__(numpy.left_shift, '__call__',
+                                    self, other)
+
+    def __rlshift__(self, other):
+        return self.__array_ufunc__(numpy.left_shift, '__call__',
+                                    other, self)
+
+    def __ilshift__(self, other):
+        return self.__array_ufunc__(numpy.left_shift, '__call__',
+                                    self, other, out=self)
+
+    def __rshift__(self, other):
+        return self.__array_ufunc__(numpy.right_shift, '__call__',
+                                    self, other)
+
+    def __rrshift__(self, other):
+        return self.__array_ufunc__(numpy.right_shift, '__call__',
+                                    other, self)
+
+    def __irshift__(self, other):
+        return self.__array_ufunc__(numpy.right_shift, '__call__',
+                                    self, other, out=self)
+
+    # logical ops
+    def __and__(self, other):
+        return self.__array_ufunc__(numpy.logical_and, '__call__',
+                                    self, other)
+
+    def __rand__(self, other):
+        return self.__array_ufunc__(numpy.logical_and, '__call__',
+                                    other, self)
+
+    def __iand__(self, other):
+        return self.__array_ufunc__(numpy.logical_and, '__call__',
+                                    self, other, out=self)
+
+    def __or__(self, other):
+        return self.__array_ufunc__(numpy.logical_or, '__call__',
+                                    self, other)
+
+    def __ror__(self, other):
+        return self.__array_ufunc__(numpy.logical_or, '__call__',
+                                    other, self)
+
+    def __ior__(self, other):
+        return self.__array_ufunc__(numpy.logical_or, '__call__',
+                                    self, other, out=self)
+
+    def __xor__(self, other):
+        return self.__array_ufunc__(numpy.logical_xor, '__call__',
+                                    self, other)
+
+    def __rxor__(self, other):
+        return self.__array_ufunc__(numpy.logical_xor, '__call__',
+                                    other, self)
+
+    def __ixor__(self, other):
+        return self.__array_ufunc__(numpy.logical_xor, '__call__',
+                                    self, other, out=self)
 
     # misc other things
     @property
@@ -235,60 +308,20 @@ def fill(self, value):
         self[...] = value
 """
     # reductions
-    def all(self, axis=None, out=None):
-        if self.ndim == 0:
-            return self.copy()
-        return reduce1(self, '&&', '1', np.dtype('bool'),
-                       axis=axis, out=out)
-
-    def any(self, axis=None, out=None):
-        if self.ndim == 0:
-            return self.copy()
-        return reduce1(self, '||', '0', np.dtype('bool'),
-                       axis=axis, out=out)
-
-    def prod(self, axis=None, dtype=None, out=None):
-        if dtype is None:
-            dtype = self.dtype
-            # we only upcast integers that are smaller than the plaform default
-            if dtype.kind == 'i':
-                di = np.dtype('int')
-                if di.itemsize > dtype.itemsize:
-                    dtype = di
-            if dtype.kind == 'u':
-                di = np.dtype('uint')
-                if di.itemsize > dtype.itemsize:
-                    dtype = di
-        return reduce1(self, '*', '1', dtype, axis=axis, out=out)
-
-#    def max(self, axis=None, out=None);
-#        nd = self.ndim
-#        if nd == 0:
-#            return self.copy()
-#        idx = (0,) * nd
-#        n = str(self.__getitem__(idx).__array__())
-#        return reduce1(self, '', n, self.dtype, axis=axis, out=out,
-#                       oper='max(a, b)')
-
-#    def min(self, axis=None, out=None):
-#        nd = self.ndim
-#        if nd == 0:
-#            return self.copy()
-#        idx = (0,) * nd
-#        n = str(self.__getitem__(idx).__array__())
-#        return reduce1(self, '', n, self.dtype, axis=axis, out=out,
-#                       oper='min(a, b)')
-
-    def sum(self, axis=None, dtype=None, out=None):
-        if dtype is None:
-            dtype = self.dtype
-            # we only upcast integers that are smaller than the plaform default
-            if dtype.kind == 'i':
-                di = np.dtype('int')
-                if di.itemsize > dtype.itemsize:
-                    dtype = di
-            if dtype.kind == 'u':
-                di = np.dtype('uint')
-                if di.itemsize > dtype.itemsize:
-                    dtype = di
-        return reduce1(self, '+', '0', dtype, axis=axis, out=out)
+    def all(self, axis=None, out=None, keepdims=False):
+        return ufuncs.all(self, axis, out, keepdims)
+
+    def any(self, axis=None, out=None, keepdims=False):
+        return ufuncs.any(self, axis, out, keepdims)
+
+    def prod(self, axis=None, out=None, keepdims=False):
+        return ufuncs.prod(self, axis, out, keepdims)
+
+    def sum(self, axis=None, out=None, keepdims=False):
+        return ufuncs.sum(self, axis, out, keepdims)
+
+    def min(self, axis=None, out=None, keepdims=False):
+        return ufuncs.amin(self, axis, out, keepdims)
+
+    def max(self, axis=None, out=None, keepdims=False):
+        return ufuncs.amax(self, axis, out, keepdims)
diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py
new file mode 100644
index 0000000000..4246565963
--- /dev/null
+++ b/pygpu/tests/test_ufuncs.py
@@ -0,0 +1,569 @@
+from nose.tools import assert_raises
+import numpy
+
+import pygpu
+from pygpu.dtypes import NAME_TO_DTYPE
+from pygpu import ufuncs
+from pygpu.ufuncs import (
+    UNARY_UFUNCS, UNARY_UFUNCS_TWO_OUT, BINARY_UFUNCS, BINARY_UFUNC_TO_C_CMP,
+    BINARY_UFUNC_TO_C_BINOP, BINARY_UFUNC_TO_C_FUNC)
+from pygpu.tests.support import context
+
+
+pygpu.set_default_context(context)
+numpy.seterr(invalid='ignore', divide='ignore')  # avoid stdout warnings
+
+
+# --- Helper functions and global definitions --- #
+
+def npy_and_gpuary_arrays(shape, dtype, positive=False):
+    """Return Numpy and GPU arrays of given shape and dtype."""
+    if dtype == numpy.bool:
+        npy_arr = numpy.random.randint(0, 1, size=shape, dtype=dtype)
+    elif numpy.issubsctype(dtype, numpy.unsignedinteger):
+        npy_arr = numpy.random.randint(1, 10, size=shape, dtype=dtype)
+    elif numpy.issubsctype(dtype, numpy.integer):
+        low = 1 if positive else -10
+        npy_arr = numpy.random.randint(low, 10, size=shape, dtype=dtype)
+    elif numpy.issubsctype(dtype, numpy.floating):
+        npy_arr = numpy.random.normal(size=shape).astype(dtype)
+        if positive:
+            npy_arr = numpy.abs(npy_arr) + 0.1
+    elif numpy.issubsctype(dtype, numpy.complexfloating):
+        npy_arr_re = numpy.random.normal(size=shape).astype(dtype)
+        npy_arr_im = numpy.random.normal(size=shape).astype(dtype)
+        npy_arr = npy_arr_re + 1j * npy_arr_im
+        if positive:
+            npy_arr = (numpy.abs(npy_arr) + 0.1).astype(dtype)
+    else:
+        assert False
+
+    gpuary_arr = pygpu.array(npy_arr, dtype=dtype)
+    return npy_arr, gpuary_arr
+
+
+reductions = ['sum', 'prod', 'amin', 'amax']
+axis_params = [None, 0, 2, (1, 2), (0,), (0, 1), (0, 1, 2)]
+dtypes = set(NAME_TO_DTYPE.values())
+
+# --- Ufuncs & Reductions --- #
+
+
+def test_reduction():
+    for reduction in reductions:
+        for axis in axis_params:
+            for keepdims in [True, False]:
+                for dtype in dtypes:
+                    yield check_reduction, reduction, dtype, axis, keepdims
+
+
+def check_reduction(reduction, dtype, axis, keepdims):
+    """Test GpuArray reduction against equivalent Numpy result."""
+    gpuary_reduction = getattr(ufuncs, reduction)
+    npy_reduction = getattr(numpy, reduction)
+
+    npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3, 4),
+                                                dtype=dtype)
+    # Determine relative tolerance from dtype
+    try:
+        res = numpy.finfo(dtype).resolution
+    except ValueError:
+        res = 0
+    rtol = 2 * npy_arr.size * res
+
+    if (numpy.issubsctype(dtype, numpy.complexfloating) and
+            reduction in ('amin', 'amax')):
+        with assert_raises(ValueError):
+            gpuary_reduction(gpuary_arr, axis=axis, keepdims=keepdims)
+        return
+
+    # No explicit out dtype
+    npy_result = npy_reduction(npy_arr, axis=axis, keepdims=keepdims)
+    gpuary_result = gpuary_reduction(gpuary_arr, axis=axis,
+                                     keepdims=keepdims)
+    if numpy.isscalar(npy_result):
+        assert numpy.isscalar(gpuary_result)
+        assert numpy.isclose(gpuary_result, npy_result, rtol=rtol,
+                             atol=rtol)
+    else:
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol)
+
+    # With out array
+    out = pygpu.empty(npy_result.shape, npy_result.dtype)
+    gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, out=out,
+                                     keepdims=keepdims)
+    assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol)
+    assert out is gpuary_result
+
+    # Explicit out dtype, supported by some reductions only
+    if reduction in ('sum', 'prod'):
+        out_dtype = numpy.promote_types(dtype, numpy.float32)
+        gpuary_result = gpuary_reduction(gpuary_arr, axis=axis,
+                                         dtype=out_dtype,
+                                         keepdims=keepdims)
+        assert gpuary_result.dtype == out_dtype
+        assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol)
+
+    # Using numpy arrays as input
+    gpuary_result = gpuary_reduction(npy_arr, axis=axis, keepdims=keepdims)
+    if numpy.isscalar(npy_result):
+        assert numpy.isscalar(gpuary_result)
+        assert numpy.isclose(gpuary_result, npy_result, rtol=rtol,
+                             atol=rtol)
+    else:
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol)
+
+
+# Dictionary of failing combinations.
+# Reasons for failure:
+# - reciprocal: Numpy ufunc is equivalent to logical_not for bool, no idea why
+# - sign: fails for Numpy, too, since there's no valid signature
+# - fmod: division by 0 handled differently
+# - spacing: Numpy upcasts to float16 for small int types, we can't do that yet
+fail_unary = {'reciprocal': [bool],
+              'sign': [bool],
+              'fmod': [bool],
+              'spacing': [bool, numpy.int8, numpy.uint8],
+              }
+
+
+def test_unary_ufuncs():
+    for ufunc in UNARY_UFUNCS:
+        for dtype in dtypes:
+            if dtype not in fail_unary.get(ufunc, []):
+                yield check_unary_ufunc, ufunc, dtype
+
+
+def check_unary_ufunc(ufunc, dtype):
+    """Test GpuArray unary ufuncs against equivalent Numpy results."""
+    gpuary_ufunc = getattr(ufuncs, ufunc)
+    npy_ufunc = getattr(numpy, ufunc)
+
+    npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype,
+                                                positive=True)
+    try:
+        res = numpy.finfo(dtype).resolution
+    except ValueError:
+        res = numpy.finfo(numpy.promote_types(dtype,
+                                              numpy.float16)).resolution
+    rtol = 10 * res
+
+    try:
+        npy_result = npy_ufunc(npy_arr)
+    except TypeError:
+        # Make sure we raise the same error as Numpy
+        with assert_raises(TypeError):
+            gpuary_ufunc(gpuary_arr)
+    else:
+        if npy_result.dtype == numpy.dtype('float16'):
+            # We use float32 as minimum for GPU arrays, do the same here
+            npy_result = npy_result.astype('float32')
+
+        gpuary_result = gpuary_ufunc(gpuary_arr)
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol, equal_nan=True)
+
+        # In-place
+        out = pygpu.empty(npy_result.shape, npy_result.dtype)
+        gpuary_ufunc(gpuary_arr, out)
+        assert numpy.allclose(out, npy_result, rtol=rtol,
+                              equal_nan=True)
+
+
+def test_unary_ufuncs_two_out():
+    for ufunc in UNARY_UFUNCS_TWO_OUT:
+        for dtype in dtypes:
+            if dtype not in fail_unary.get(ufunc, []):
+                yield check_unary_ufunc_two_out, ufunc, dtype
+
+
+def check_unary_ufunc_two_out(ufunc, dtype):
+    """Test GpuArray unary ufuncs with two outputs against Numpy."""
+    gpuary_ufunc = getattr(ufuncs, ufunc)
+    npy_ufunc = getattr(numpy, ufunc)
+
+    npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype,
+                                                positive=True)
+    try:
+        res = numpy.finfo(dtype).resolution
+    except ValueError:
+        res = numpy.finfo(numpy.promote_types(dtype,
+                                              numpy.float16)).resolution
+    rtol = 10 * res
+
+    try:
+        npy_result1, npy_result2 = npy_ufunc(npy_arr)
+    except TypeError:
+        # Make sure we raise the same error as Numpy
+        with assert_raises(TypeError):
+            gpuary_ufunc(gpuary_arr)
+    else:
+        if npy_result1.dtype == numpy.dtype('float16'):
+            # We use float32 as minimum for GPU arrays, do the same here
+            npy_result1 = npy_result1.astype('float32')
+        if npy_result2.dtype == numpy.dtype('float16'):
+            npy_result2 = npy_result2.astype('float32')
+
+        gpuary_result1, gpuary_result2 = gpuary_ufunc(gpuary_arr)
+        assert npy_result1.shape == gpuary_result1.shape
+        assert npy_result2.shape == gpuary_result2.shape
+        assert npy_result1.dtype == gpuary_result1.dtype
+        assert npy_result2.dtype == gpuary_result2.dtype
+        assert numpy.allclose(gpuary_result1, npy_result1, rtol=rtol,
+                              atol=rtol, equal_nan=True)
+        assert numpy.allclose(gpuary_result2, npy_result2, rtol=rtol,
+                              atol=rtol, equal_nan=True)
+
+        # In-place
+        out1 = pygpu.empty(npy_result1.shape, npy_result1.dtype)
+        out2 = pygpu.empty(npy_result2.shape, npy_result2.dtype)
+        gpuary_ufunc(gpuary_arr, out1, out2)
+        assert numpy.allclose(out1, npy_result1, rtol=rtol, equal_nan=True)
+        assert numpy.allclose(out2, npy_result2, rtol=rtol, equal_nan=True)
+
+
+sint_dtypes = [dt for dt in NAME_TO_DTYPE.values()
+               if numpy.issubsctype(dt, numpy.signedinteger)]
+uint_dtypes = [dt for dt in NAME_TO_DTYPE.values()
+               if numpy.issubsctype(dt, numpy.unsignedinteger)]
+int_dtypes = sint_dtypes + uint_dtypes
+float_dtypes = [dt for dt in NAME_TO_DTYPE.values()
+                if numpy.issubsctype(dt, numpy.floating)]
+# Dictionary of failing combinations.
+# If a dtype key is not present, it is equivalent to "all dtypes".
+# Reasons for failure:
+# - fmod: division by zero handled differently
+# - nextafter: for bool, Numpy upcasts to float16 only, we cast to float32
+# - ldexp: Numpy upcasts to float32, we to float64
+# - left_shift: Numpy wraps around (True << -3 is huge), our code doesn't
+# - floor_divide: division by zero handled differently
+# - logical_xor: our code yields True for True ^ 0.1 due to rounding
+# - remainder: division by zero handled differently
+# - logaddexp: Numpy upcasts to float32, we to float64
+# - logaddexp2: Numpy upcasts to float32, we to float64
+fail_binary = {'fmod': {'dtype2': [bool]},  # wrong where array2 is 0
+               'nextafter': {'dtype1': [bool], 'dtype2': int_dtypes},
+               'ldexp': {'dtype2': [numpy.uint16]},
+               'left_shift': {'dtype2': sint_dtypes},
+               'floor_divide': {'dtype2': [bool]},
+               'logical_xor': {'dtype1': [bool], 'dtype2': float_dtypes},
+               'remainder': {'dtype2': [bool]},
+               'logaddexp': {'dtype2': [numpy.uint16]},
+               'logaddexp2': {'dtype2': [numpy.uint16]},
+               }
+# ufuncs where negative scalars trigger upcasting that differs from Numpy
+upcast_wrong_neg = ('copysign', 'ldexp', 'hypot', 'arctan2', 'nextafter')
+
+
+def test_binary_ufuncs():
+    for ufunc in BINARY_UFUNCS:
+        for dtype1 in dtypes:
+            for dtype2 in dtypes:
+                if (ufunc in fail_binary and
+                    dtype1 in fail_binary[ufunc].get('dtype1', [dtype1]) and
+                        dtype2 in fail_binary[ufunc].get('dtype2', [dtype2])):
+                    pass
+                else:
+                    yield check_binary_ufunc, ufunc, dtype1, dtype2
+
+                scalars_left = [-2, -1, -0.5, 0, 1, 2.5, 4]
+                scalars_right = list(scalars_left)
+
+                # For scalars, we need to exclude some cases
+
+                # Obvious things first: no invalid scalars for the given dtype
+                if numpy.issubsctype(dtype1, numpy.integer):
+                    scalars_left = [x for x in scalars_left if int(x) == x]
+                if numpy.issubsctype(dtype2, numpy.integer):
+                    scalars_right = [x for x in scalars_right if int(x) == x]
+                if numpy.issubsctype(dtype1, numpy.unsignedinteger):
+                    scalars_left = [x for x in scalars_left if x >= 0]
+                if numpy.issubsctype(dtype2, numpy.unsignedinteger):
+                    scalars_right = [x for x in scalars_right if x >= 0]
+
+                # Special treatment for some ufuncs
+                if ufunc == 'power':
+                    if (numpy.issubsctype(dtype1, numpy.integer) or
+                            dtype1 == bool):
+                        # Negative integer power of integer is invalid
+                        scalars_right = [x for x in scalars_right if x >= 0]
+                    if numpy.issubsctype(dtype2, numpy.signedinteger):
+                        # Negative integer power of integer is invalid
+                        scalars_left = [x for x in scalars_right if x >= 0]
+                if ufunc in ('fmod', 'remainder', 'floor_divide'):
+                    # These ufuncs divide by the right operand, so we remove 0
+                    scalars_left = [x for x in scalars_left if x >= 0]
+                if ufunc == 'copysign':
+                    if numpy.issubsctype(dtype2, numpy.unsignedinteger):
+                        # Don't try to copy negative sign to unsigned array
+                        scalars_left = [x for x in scalars_left if x >= 0]
+                if ufunc == 'ldexp':
+                    # Only integers as second argument
+                    scalars_right = [x for x in scalars_right if int(x) == x]
+                    if numpy.issubsctype(dtype2, numpy.unsignedinteger):
+                        # This is not allowed due to invalid type coercion
+                        scalars_left = []
+
+                # Remove negative scalars for ufuncs with differing upcasting
+                if ufunc in upcast_wrong_neg:
+                    scalars_left = [x for x in scalars_left if x >= 0]
+                    scalars_right = [x for x in scalars_left if x >= 0]
+
+                for scalar in scalars_left:
+                    if (ufunc in fail_binary and
+                            fail_binary[ufunc].get('dtype2', [dtype2])):
+                        pass
+                    else:
+                        yield (check_binary_ufunc_scalar_left,
+                               ufunc, scalar, dtype2)
+
+                for scalar in scalars_right:
+                    if (ufunc in fail_binary and
+                            fail_binary[ufunc].get('dtype1', [dtype1])):
+                        pass
+                    else:
+                        yield (check_binary_ufunc_scalar_right,
+                               ufunc, scalar, dtype1)
+
+
+def check_binary_ufunc(ufunc, dtype1, dtype2):
+    """Test GpuArray binary ufunc against equivalent Numpy result."""
+    gpuary_ufunc = getattr(ufuncs, ufunc)
+    npy_ufunc = getattr(numpy, ufunc)
+
+    npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3),
+                                                dtype=dtype1,
+                                                positive=True)
+    npy_arr2, gpuary_arr2 = npy_and_gpuary_arrays(shape=(2, 3),
+                                                  dtype=dtype2,
+                                                  positive=True)
+    try:
+        res = numpy.finfo(numpy.result_type(dtype1, dtype2)).resolution
+    except ValueError:
+        res = numpy.finfo(numpy.result_type(numpy.float16,
+                                            dtype1, dtype2)).resolution
+    rtol = 10 * res
+
+    try:
+        npy_result = npy_ufunc(npy_arr, npy_arr2)
+    except TypeError:
+        # Make sure we raise the same error as Numpy
+        with assert_raises(TypeError):
+            gpuary_ufunc(gpuary_arr, gpuary_arr2)
+    else:
+        if npy_result.dtype == numpy.dtype('float16'):
+            # We use float32 as minimum for GPU arrays, do the same here
+            npy_result = npy_result.astype('float32')
+
+        gpuary_result = gpuary_ufunc(gpuary_arr, gpuary_arr2)
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol, equal_nan=True)
+
+        # In-place
+        out = pygpu.empty(npy_result.shape, npy_result.dtype)
+        gpuary_ufunc(gpuary_arr, gpuary_arr2, out)
+        assert numpy.allclose(out, npy_result, rtol=rtol,
+                              equal_nan=True)
+
+
+def check_binary_ufunc_scalar_left(ufunc, scalar, dtype):
+    """Test GpuArray binary ufunc with scalar first operand."""
+    gpuary_ufunc = getattr(ufuncs, ufunc)
+    npy_ufunc = getattr(numpy, ufunc)
+
+    npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3),
+                                                dtype=dtype,
+                                                positive=True)
+    try:
+        res = numpy.finfo(dtype).resolution
+    except ValueError:
+        res = numpy.finfo(numpy.promote_types(dtype,
+                                              numpy.float16)).resolution
+    rtol = 10 * res
+
+    try:
+        npy_result = npy_ufunc(scalar, npy_arr)
+    except TypeError:
+        # Make sure we raise the same error as Numpy
+        with assert_raises(TypeError):
+            gpuary_ufunc(gpuary_arr, scalar)
+    else:
+        if npy_result.dtype == numpy.dtype('float16'):
+            # We use float32 as minimum for GPU arrays, do the same here
+            npy_result = npy_result.astype('float32')
+
+        gpuary_result = gpuary_ufunc(scalar, gpuary_arr)
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol, equal_nan=True)
+
+
+def check_binary_ufunc_scalar_right(ufunc, scalar, dtype):
+    """Test GpuArray binary ufunc with scalar second operand."""
+    gpuary_ufunc = getattr(ufuncs, ufunc)
+    npy_ufunc = getattr(numpy, ufunc)
+
+    npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3),
+                                                dtype=dtype,
+                                                positive=True)
+    try:
+        res = numpy.finfo(dtype).resolution
+    except ValueError:
+        res = numpy.finfo(numpy.promote_types(dtype,
+                                              numpy.float16)).resolution
+    rtol = 10 * res
+
+    try:
+        npy_result = npy_ufunc(npy_arr, scalar)
+    except TypeError:
+        # Make sure we raise the same error as Numpy
+        with assert_raises(TypeError):
+            gpuary_ufunc(gpuary_arr, scalar)
+    else:
+        if npy_result.dtype == numpy.dtype('float16'):
+            # We use float32 as minimum for GPU arrays, do the same here
+            npy_result = npy_result.astype('float32')
+
+        gpuary_result = gpuary_ufunc(gpuary_arr, scalar)
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol, equal_nan=True)
+
+
+def test_binary_ufuncs_reduce():
+    for ufunc in BINARY_UFUNC_TO_C_FUNC:
+        # Only one axis parameter can be used since it's not reorderable
+        for axis in [0, 1, 2]:
+            for keepdims in [True, False]:
+                if (ufunc in fail_binary and
+                    'float32' in fail_binary[ufunc].get('dtype1',
+                                                        ['float32']) and
+                        'float32' in fail_binary[ufunc].get('dtype2',
+                                                            ['float32'])):
+                    pass
+                else:
+                    yield check_binary_ufunc_reduce, ufunc, axis, keepdims
+
+    for ufunc in BINARY_UFUNC_TO_C_BINOP:
+        for axis in axis_params:
+            for keepdims in [True, False]:
+                if (ufunc in fail_binary and
+                    'float32' in fail_binary[ufunc].get('dtype1',
+                                                        ['float32']) and
+                        'float32' in fail_binary[ufunc].get('dtype2',
+                                                            ['float32'])):
+                    pass
+                elif ufunc.startswith('bitwise') or ufunc.endswith('shift'):
+                    # Invalid source error from CUDA
+                    pass
+                elif ufunc.startswith('logical'):
+                    # Wrong resulting dtype (float instead of bool)
+                    pass
+                elif ufunc in ('subtract',):
+                    # Not reorderable, thus only applicable along one axis
+                    pass
+                else:
+                    yield check_binary_ufunc_reduce, ufunc, axis, keepdims
+
+    for ufunc in BINARY_UFUNC_TO_C_CMP:
+        for axis in axis_params:
+            for keepdims in [True, False]:
+                if (ufunc in fail_binary and
+                    'float32' in fail_binary[ufunc].get('dtype1',
+                                                        ['float32']) and
+                        'float32' in fail_binary[ufunc].get('dtype2',
+                                                            ['float32'])):
+                    pass
+                elif ufunc in ('equal', 'not_equal',
+                               'greater', 'greater_equal',
+                               'less', 'less_equal'):
+                    # Not reorderable, thus only applicable along one axis
+                    pass
+                else:
+                    yield check_binary_ufunc_reduce, ufunc, axis, keepdims
+
+
+def check_binary_ufunc_reduce(ufunc, axis, keepdims):
+    """Test GpuArray ufunc.reduce against equivalent Numpy result."""
+    gpuary_ufunc = getattr(ufuncs, ufunc)
+    npy_ufunc = getattr(numpy, ufunc)
+
+    npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3, 4),
+                                                dtype='float32')
+    # Determine relative tolerance from dtype
+    try:
+        res = numpy.finfo('float32').resolution
+    except ValueError:
+        res = 0
+    rtol = 2 * npy_arr.size * res
+
+    # No explicit out dtype
+    try:
+        npy_result = npy_ufunc.reduce(npy_arr, axis=axis, keepdims=keepdims)
+    except TypeError:
+        # Make sure missing signature produces a TypeError, then bail out
+        with assert_raises(TypeError):
+            gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis,
+                                                keepdims=keepdims)
+            # Emulate the TypeError if there is no native implementation
+            # and Numpy raises
+            if gpuary_result is NotImplemented:
+                raise TypeError
+        return
+
+    gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis,
+                                        keepdims=keepdims)
+    if gpuary_result is NotImplemented:
+        # Nothing to check
+        return
+
+    if numpy.isscalar(npy_result):
+        assert numpy.isscalar(gpuary_result)
+        assert numpy.isclose(gpuary_result, npy_result, rtol=rtol,
+                             atol=rtol, equal_nan=True)
+    else:
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol, equal_nan=True)
+
+    # With out array
+    out = pygpu.empty(npy_result.shape, npy_result.dtype)
+    gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, out=out,
+                                        keepdims=keepdims)
+    assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol,
+                          equal_nan=True)
+    assert out is gpuary_result
+
+    # Explicit out dtype, supported by some reductions only
+    if ufunc in BINARY_UFUNC_TO_C_BINOP or ufunc in BINARY_UFUNC_TO_C_FUNC:
+        gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis,
+                                            dtype='float64',
+                                            keepdims=keepdims)
+        assert gpuary_result.dtype == 'float64'
+        assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol,
+                              equal_nan=True)
+
+    # Using numpy arrays as input
+    gpuary_result = gpuary_ufunc.reduce(npy_arr, axis=axis, keepdims=keepdims)
+    if numpy.isscalar(npy_result):
+        assert numpy.isscalar(gpuary_result)
+        assert numpy.isclose(gpuary_result, npy_result, rtol=rtol,
+                             atol=rtol, equal_nan=True)
+    else:
+        assert npy_result.shape == gpuary_result.shape
+        assert npy_result.dtype == gpuary_result.dtype
+        assert numpy.allclose(gpuary_result, npy_result, rtol=rtol,
+                              atol=rtol, equal_nan=True)
diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py
new file mode 100644
index 0000000000..905c7dd853
--- /dev/null
+++ b/pygpu/ufuncs.py
@@ -0,0 +1,1320 @@
+﻿"""Ufuncs and reductions for GPU arrays."""
+
+import mako
+import numpy
+from pkg_resources import parse_version
+import sys
+import warnings
+from pygpu.dtypes import dtype_to_ctype, NAME_TO_DTYPE
+from pygpu._elemwise import arg
+from pygpu.elemwise import as_argument, GpuElemwise
+from pygpu.reduction import reduce1
+from pygpu.gpuarray import GpuArray, array, empty, get_default_context
+
+PY3 = sys.version_info.major >= 3
+
+# Save for later use since the original names are used for reductions
+builtin_all = all
+builtin_any = any
+
+
+# --- Helper functions --- #
+
+
+def restore_reduced_dims(shape, red_axes):
+    """Return tuple from ``shape`` with size-1 axes at indices ``red_axes``."""
+    newshape = list(shape)
+    try:
+        for ax in sorted(red_axes):
+            newshape.insert(ax, 1)
+    except TypeError:
+        newshape.insert(red_axes, 1)
+    return tuple(newshape)
+
+
+def reduce_dims(shape, red_axes):
+    """Return tuple from ``shape`` with ``red_axes`` removed."""
+    newshape = list(shape)
+    try:
+        for ax in reversed(sorted(red_axes)):
+            newshape.pop(ax)
+    except TypeError:
+        newshape.pop(red_axes)
+    return tuple(newshape)
+
+
+# --- Reductions with arithmetic operators --- #
+
+
+def _prepare_array_for_reduction(a, out, context=None):
+    """Return input array ready for usage in a reduction kernel."""
+    # Lazy import to avoid circular dependency
+    from pygpu._array import ndgpuarray
+
+    # Get a context and an array class to work with. Use the "highest"
+    # class present in the inputs.
+    need_context = True
+    cls = None
+    for ary in (a, out):
+        if isinstance(ary, GpuArray):
+            if context is not None and ary.context != context:
+                raise ValueError('cannot mix contexts')
+            context = ary.context
+            if cls is None or cls == GpuArray:
+                cls = ary.__class__
+            need_context = False
+
+    if need_context and context is None:
+        context = get_default_context()
+        cls = ndgpuarray
+
+    if not isinstance(a, GpuArray):
+        a = numpy.asarray(a)
+        if a.flags.f_contiguous and not a.flags.c_contiguous:
+            order = 'F'
+        else:
+            order = 'C'
+        a = array(a, dtype=a.dtype, copy=False, order=order, context=context,
+                  cls=cls)
+
+    return a
+
+
+def reduce_with_op(a, op, neutral, axis=None, dtype=None, out=None,
+                   keepdims=False, context=None):
+    """Reduce ``a`` using the operation ``op``.
+
+    This is a wrapper around `pygpu.reduction.reduce1` with signature
+    closer to NumPy and parameter ``keepdims``.
+
+    Parameters
+    ----------
+    a : `pygpu.gpuarray.GpuArray`
+        Array that should be reduced.
+    op : str
+        Operation to be used for reduction. The reduction logic is::
+
+            result = a op b
+
+    neutral : scalar
+        Neutral element of the operation fulfilling ``(n op a) == a``
+        for all ``a``. It is used as initial state of the result.
+
+    axis, dtype, out, keepdims :
+        Arguments as in NumPy reductions. See e.g. `numpy.sum`.
+    context : `pygpu.gpuarray.GpuContext`, optional
+        Use this GPU context to evaluate the GPU kernel. For ``None``,
+        and if neither ``out`` nor ``a`` are GPU arrays, a default
+        GPU context must have been set.
+
+    Returns
+    -------
+    reduced : `pygpu.gpuarray.GpuArray` or scalar
+        If not all axes are reduced over or ``out`` was given, the result is
+        an array, a reference to ``out`` if provided. Otherwise, i.e. for
+        reductions along all axes without ``out`` parameter, the result
+        is a scalar.
+    """
+    a = _prepare_array_for_reduction(a, out, context)
+
+    if dtype is None:
+        if numpy.issubsctype(a.dtype, numpy.unsignedinteger):
+            # Avoid overflow for small integer types by default, as in Numpy
+            out_type = max(a.dtype, numpy.dtype('uint'))
+        elif numpy.issubsctype(a.dtype, numpy.integer):
+            out_type = max(a.dtype, numpy.dtype('int'))
+        else:
+            out_type = a.dtype
+    else:
+        out_type = dtype
+
+    axes = axis if axis is not None else tuple(range(a.ndim))
+    if out is not None:
+        out_arr = out.reshape(reduce_dims(a.shape, axes))
+    else:
+        out_arr = out
+
+    r = reduce1(a, op=op, neutral=neutral, out_type=out_type, axis=axis,
+                out=out_arr)
+    if keepdims:
+        newshape = restore_reduced_dims(r.shape, axes)
+        r = r.reshape(newshape)
+    if not r.shape and out is None:
+        return numpy.asarray(r).reshape([1])[0]
+    elif out is not None:
+        return out
+    else:
+        return r
+
+
+def sum(a, axis=None, dtype=None, out=None, keepdims=False, context=None):
+    """Sum of array elements over a given axis.
+
+    See Also
+    --------
+    numpy.sum
+    """
+    # Do what Numpy does with booleans, sensible or not
+    if a.dtype == bool and dtype is None:
+        dtype = int
+    return reduce_with_op(a, '+', 0, axis, dtype, out, keepdims, context)
+
+
+def prod(a, axis=None, dtype=None, out=None, keepdims=False, context=None):
+    """Return the product of array elements over a given axis.
+
+    See Also
+    --------
+    numpy.prod
+    """
+    # Do what Numpy does with booleans, sensible or not
+    if a.dtype == bool and dtype is None:
+        dtype = int
+    return reduce_with_op(a, '*', 1, axis, dtype, out, keepdims, context)
+
+
+def all(a, axis=None, out=None, keepdims=False, context=None):
+    """Test whether all array elements along a given axis evaluate to True.
+
+    See Also
+    --------
+    numpy.all
+    """
+    return reduce_with_op(a, '&&', 1, axis, numpy.bool, out, keepdims, context)
+
+
+def any(a, axis=None, out=None, keepdims=False, context=None):
+    """Test whether all array elements along a given axis evaluate to True.
+
+    See Also
+    --------
+    numpy.all
+    """
+    return reduce_with_op(a, '||', 0, axis, numpy.bool, out, keepdims, context)
+
+
+# --- Reductions with comparison operators --- #
+
+
+def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False,
+                    context=None):
+    """Reduce ``a`` by comparison using ``cmp``.
+
+    This is a wrapper around `pygpu.reduction.reduce1` with signature
+    closer to NumPy and parameter ``keepdims``.
+
+    Parameters
+    ----------
+    a : `pygpu.gpuarray.GpuArray`
+        Array that should be reduced.
+    cmp : str
+        Comparison operator to be used for reduction. The reduction
+        logic is::
+
+            result = (a cmp b) ? a : b
+
+    neutral : scalar
+        Neutral element of the comparison fulfilling ``(n cmp a) == True``
+        for all ``a``. It is used as initial state of the result.
+
+    axis, out, keepdims :
+        Arguments as in NumPy reductions. See e.g. `numpy.amax`.
+    context : `pygpu.gpuarray.GpuContext`, optional
+        Use this GPU context to evaluate the GPU kernel. For ``None``,
+        and if neither ``out`` nor ``a`` are GPU arrays, a default
+        GPU context must have been set.
+
+    Returns
+    -------
+    reduced : `pygpu.gpuarray.GpuArray` or scalar
+        If not all axes are reduced over or ``out`` was given, the result is
+        an array, a reference to ``out`` if provided. Otherwise, i.e. for
+        reductions along all axes without ``out`` parameter, the result
+        is a scalar.
+    """
+    a = _prepare_array_for_reduction(a, out, context)
+
+    axes = axis if axis is not None else tuple(range(a.ndim))
+    if out is not None:
+        out_arr = out.reshape(reduce_dims(a.shape, axes))
+    else:
+        out_arr = out
+
+    oper = '(a {} b) ? a : b'.format(cmp)
+    r = reduce1(a, op=None, neutral=neutral, out_type=a.dtype, axis=axis,
+                out=out_arr, oper=oper)
+    if keepdims:
+        axes = axis if axis is not None else tuple(range(a.ndim))
+        newshape = restore_reduced_dims(r.shape, axes)
+        r = r.reshape(newshape)
+    if not r.shape and out is None:
+        return numpy.asarray(r).reshape([1])[0]
+    elif out is not None:
+        return out
+    else:
+        return r
+
+
+def amin(a, axis=None, out=None, keepdims=False, context=None):
+    """Return the minimum of an array or minimum along an axis.
+
+    See Also
+    --------
+    numpy.amin
+    """
+    a = _prepare_array_for_reduction(a, out)
+    if a.dtype == numpy.bool:
+        neutral = 1
+    elif numpy.issubsctype(a.dtype, numpy.integer):
+        neutral = numpy.iinfo(a.dtype).max
+    elif numpy.issubsctype(a.dtype, numpy.floating):
+        neutral = 'INFINITY'
+    elif numpy.issubsctype(a.dtype, numpy.complexfloating):
+        raise ValueError('array dtype {!r} not comparable'
+                         ''.format(a.dtype.name))
+    else:
+        raise ValueError('array dtype {!r} not supported'
+                         ''.format(a.dtype.name))
+    return reduce_with_cmp(a, '<', neutral, axis, out, keepdims, context)
+
+
+def amax(a, axis=None, out=None, keepdims=False, context=None):
+    """Return the maximum of an array or minimum along an axis.
+
+    See Also
+    --------
+    numpy.amax
+    """
+    a = _prepare_array_for_reduction(a, out)
+    if a.dtype == numpy.bool:
+        neutral = 0
+    elif numpy.issubsctype(a.dtype, numpy.integer):
+        neutral = numpy.iinfo(a.dtype).min
+    elif numpy.issubsctype(a.dtype, numpy.floating):
+        neutral = '-INFINITY'
+    elif numpy.issubsctype(a.dtype, numpy.complexfloating):
+        raise ValueError('array dtype {!r} not comparable'
+                         ''.format(a.dtype.name))
+    else:
+        raise ValueError('array dtype {!r} not supported'
+                         ''.format(a.dtype.name))
+    return reduce_with_cmp(a, '>', neutral, axis, out, keepdims, context)
+
+
+# --- Elementwise ufuncs --- #
+
+
+# This dictionary is derived from Numpy's C99_FUNCS list, see
+# https://github.com/numpy/numpy/search?q=C99_FUNCS
+UNARY_C_FUNC_TO_UFUNC = {
+    'abs': 'absolute',
+    'acos': 'arccos',
+    'acosh': 'arccosh',
+    'asin': 'arcsin',
+    'asinh': 'arcsinh',
+    'atan': 'arctan',
+    'atanh': 'arctanh',
+    'cbrt': 'cbrt',
+    'ceil': 'ceil',
+    'cos': 'cos',
+    'cosh': 'cosh',
+    'exp': 'exp',
+    'exp2': 'exp2',
+    'expm1': 'expm1',
+    'fabs': 'fabs',
+    'floor': 'floor',
+    'log': 'log',
+    'log10': 'log10',
+    'log1p': 'log1p',
+    'log2': 'log2',
+    'rint': 'rint',
+    'sin': 'sin',
+    'sinh': 'sinh',
+    'sqrt': 'sqrt',
+    'tan': 'tan',
+    'tanh': 'tanh',
+    'trunc': 'trunc',
+    }
+UNARY_UFUNC_TO_C_FUNC = {v: k for k, v in UNARY_C_FUNC_TO_UFUNC.items()}
+UNARY_UFUNC_TO_C_OP = {
+    'bitwise_not': '~',
+    'logical_not': '!',
+    'negative': '-',
+    }
+UNARY_UFUNCS = (list(UNARY_UFUNC_TO_C_FUNC.keys()) +
+                list(UNARY_UFUNC_TO_C_OP.keys()))
+UNARY_UFUNCS.extend(['deg2rad', 'rad2deg', 'reciprocal', 'sign', 'signbit',
+                     'square', 'isinf', 'isnan', 'isfinite', 'spacing'])
+UNARY_UFUNCS_TWO_OUT = ['frexp', 'modf']
+BINARY_C_FUNC_TO_UFUNC = {
+    'atan2': 'arctan2',
+    'copysign': 'copysign',
+    'hypot': 'hypot',
+    'ldexp': 'ldexp',
+    'pow': 'power',
+    'nextafter': 'nextafter',
+    'fmod': 'fmod',
+}
+BINARY_UFUNC_TO_C_FUNC = {v: k for k, v in BINARY_C_FUNC_TO_UFUNC.items()}
+BINARY_UFUNC_TO_C_BINOP = {
+    'add': '+',
+    'bitwise_and': '&',
+    'bitwise_or': '|',
+    'bitwise_xor': '^',
+    'left_shift': '<<',
+    'logical_and': '&&',
+    'logical_or': '||',
+    'multiply': '*',
+    'right_shift': '>>',
+    'subtract': '-',
+    }
+BINARY_UFUNC_TO_C_CMP = {
+    'equal': '==',
+    'greater': '>',
+    'greater_equal': '>=',
+    'less': '<',
+    'less_equal': '<=',
+    'not_equal': '!=',
+    }
+BINARY_UFUNC_TO_C_OP = {}
+BINARY_UFUNC_TO_C_OP.update(BINARY_UFUNC_TO_C_BINOP)
+BINARY_UFUNC_TO_C_OP.update(BINARY_UFUNC_TO_C_CMP)
+BINARY_UFUNCS = (list(BINARY_UFUNC_TO_C_FUNC.keys()) +
+                 list(BINARY_UFUNC_TO_C_OP.keys()))
+BINARY_UFUNCS.extend(['floor_divide', 'true_divide', 'logical_xor',
+                      'maximum', 'minimum', 'remainder', 'logaddexp',
+                      'logaddexp2'])
+
+
+def ufunc_dtypes(ufunc_name, dtypes_in):
+    """Return result dtypes for a ufunc and input dtypes.
+
+    Parameters
+    ----------
+    ufunc_name : str
+        Name of the Numpy ufunc.
+    dtypes_in : sequence of `numpy.dtype`
+        Data types of the arrays.
+
+    Returns
+    -------
+    prom_in_dtypes : tuple of `numpy.dtype`
+        Promoted input dtypes, different from ``dtypes_in`` if type
+        promotion is necessary for the ufunc.
+    result_dtypes : tuple of `numpy.dtype`
+        Resulting data types of the ufunc. If a function is not suited
+        for integers, the dtype is promoted to the smallest possible
+        supported floating point data type.
+
+    Examples
+    --------
+    Real floating point types:
+
+    >>> ufunc_dtypes('absolute', [numpy.dtype('float64')])
+    ((dtype('float64'),), (dtype('float64'),))
+    >>> ufunc_dtypes('absolute', [numpy.dtype('float32')])
+    ((dtype('float32'),), (dtype('float32'),))
+    >>> ufunc_dtypes('power', [numpy.dtype('float32'), numpy.dtype('float64')])
+    ((dtype('float64'), dtype('float64')), (dtype('float64'),))
+    >>> ufunc_dtypes('power', [numpy.dtype('float32'), numpy.dtype('float32')])
+    ((dtype('float32'), dtype('float32')), (dtype('float32'),))
+
+    Integer types -- some functions produce integer results, others
+    need to convert to floating point:
+
+    >>> ufunc_dtypes('absolute', [numpy.dtype('int8')])
+    ((dtype('int8'),), (dtype('int8'),))
+    >>> ufunc_dtypes('absolute', [numpy.dtype('int16')])
+    ((dtype('int16'),), (dtype('int16'),))
+    >>> ufunc_dtypes('exp', [numpy.dtype('int8')])
+    ((dtype('float32'),), (dtype('float32'),))
+    >>> ufunc_dtypes('exp', [numpy.dtype('int16')])
+    ((dtype('float32'),), (dtype('float32'),))
+    >>> ufunc_dtypes('exp', [numpy.dtype('int32')])
+    ((dtype('float64'),), (dtype('float64'),))
+    >>> ufunc_dtypes('power', [numpy.dtype('int8'), numpy.dtype('int8')])
+    ((dtype('int8'), dtype('int8')), (dtype('int8'),))
+    >>> ufunc_dtypes('power', [numpy.dtype('int8'), numpy.dtype('float32')])
+    ((dtype('float32'), dtype('float32')), (dtype('float32'),))
+    """
+    npy_ufunc = getattr(numpy, ufunc_name)
+    supported_dtypes = set(NAME_TO_DTYPE.values())
+
+    # Filter for dtypes larger than our input dtypes, using only supported ones
+    def larger_eq_than_dtypes(sig):
+        from_part = sig.split('->')[0]
+        if len(from_part) != len(dtypes_in):
+            return False
+        else:
+            dts = tuple(numpy.dtype(c) for c in from_part)
+            # Currently unsupported, filtering out
+            if builtin_any(dt not in supported_dtypes for dt in dts):
+                return False
+            else:
+                return builtin_all(dt >= dt_in
+                                   for dt, dt_in in zip(dts, dtypes_in))
+
+    # List of ufunc signatures that are "larger" than our input dtypes
+    larger_sig_list = list(filter(larger_eq_than_dtypes, npy_ufunc.types))
+    if not larger_sig_list:
+        # Numpy raises TypeError for bad data types, which is not quite right,
+        # but we mirror that behavior
+        raise TypeError('data types {} not supported for ufunc {}'
+                        ''.format(tuple(dt.name for dt in dtypes_in),
+                                  ufunc_name))
+
+    # Key function for signature comparison. It results in comparison of
+    # *all* typecodes in the signature since they are assembled in a tuple.
+    def from_part_key(sig):
+        from_part = sig.split('->')[0]
+        return tuple(numpy.dtype(c) for c in from_part)
+
+    # Get the smallest signature larger than our input dtypes
+    smallest_sig = min(larger_sig_list, key=from_part_key)
+    smallest_str_in, smallest_str_out = smallest_sig.split('->')
+    prom_dtypes = tuple(numpy.dtype(c) for c in smallest_str_in)
+    result_dtypes = tuple(numpy.dtype(c) for c in smallest_str_out)
+
+    # Quad precision unsupported also on output side
+    if builtin_any(dt in result_dtypes for dt in (numpy.dtype('float16'),
+                                                  numpy.dtype('float128'))):
+        # TODO: Numpy raises TypeError for bad data types, which is wrong,
+        # but we mirror that behavior
+        raise TypeError('data types {} not supported for ufunc {}'
+                        ''.format(tuple(dt.name for dt in dtypes_in),
+                                  ufunc_name))
+
+    return prom_dtypes, result_dtypes
+
+
+def ufunc_c_fname(ufunc_name, dtypes_in):
+    """Return C function name for a ufunc and input dtypes.
+
+    The names are only specialized for complex math and to distinguish
+    between functions for absolute value. There is no name-mangling for
+    specific precisions.
+
+    Parameters
+    ----------
+    ufunc_name : str
+        Name of the Numpy ufunc.
+    dtype_in : sequence of `numpy.dtype`
+        Data types of input arrays.
+
+    Returns
+    -------
+    cname : str
+        C name of the ufunc.
+
+    Examples
+    --------
+    >>> ufunc_c_fname('exp', [numpy.dtype('float32')])
+    'exp'
+    >>> ufunc_c_fname('exp', [numpy.dtype('int16')])
+    'exp'
+    >>> ufunc_c_fname('absolute', [numpy.dtype('float64')])
+    'fabs'
+    >>> ufunc_c_fname('absolute', [numpy.dtype('float32')])
+    'fabs'
+    >>> ufunc_c_fname('absolute', [numpy.dtype('int8')])
+    'abs'
+    >>> ufunc_c_fname('power',
+    ...               [numpy.dtype('float32'), numpy.dtype('float64')])
+    'pow'
+
+    See Also
+    --------
+    ufunc_result_dtype : Get dtype of a ufunc result.
+    """
+    _, result_dtypes = ufunc_dtypes(ufunc_name, dtypes_in)
+    result_dtype = result_dtypes[0]
+    c_base_name = UNARY_UFUNC_TO_C_FUNC.get(ufunc_name, None)
+    if c_base_name is None:
+        c_base_name = BINARY_UFUNC_TO_C_FUNC.get(ufunc_name)
+
+    if c_base_name == 'abs':
+        if numpy.issubsctype(dtypes_in[0], numpy.floating):
+            c_base_name = 'fabs'
+
+    if numpy.issubsctype(result_dtype, numpy.complexfloating):
+        prefix = 'c'
+        # Currently broken
+        raise NotImplementedError('complex math kernels currently not '
+                                  'available')
+    else:
+        prefix = ''
+
+    return prefix + c_base_name
+
+
+def unary_ufunc(a, ufunc_name, out=None, context=None):
+    """Call a unary ufunc on an array ``a``.
+
+    Parameters
+    ----------
+    a : `array-like`
+        Input array.
+    ufunc_name : str
+        Name of the NumPy ufunc to be called on ``a``.
+    out : `pygpu.gpuarray.GpuArray`, optional
+        Array in which to store the result. Its shape must be equal to
+        ``a.shape`` and its dtype must be the result dtype of the called
+        function.
+    context : `pygpu.gpuarray.GpuContext`, optional
+        Use this GPU context to evaluate the GPU kernel. For ``None``,
+        and if neither ``out`` nor ``a`` are GPU arrays, a default
+        GPU context must have been set.
+
+    Returns
+    -------
+    out : `pygpu.gpuarray.GpuArray`
+        Result of the computation. If ``out`` was given, the returned
+        object is a reference to it. If ``a`` was not a GPU arrays, the
+        type of ``out`` is `pygpu._array.ndgpuarray`.
+
+    See Also
+    --------
+    ufunc_result_dtype : Get dtype of a ufunc result.
+    ufunc_c_funcname : Get C name for math ufuncs.
+    """
+    # Lazy import to avoid circular dependency
+    from pygpu._array import ndgpuarray
+
+    # Get a context and an array class to work with. Use the "highest"
+    # class present in the inputs.
+    need_context = True
+    cls = None
+    for ary in (a, out):
+        if isinstance(ary, GpuArray):
+            if context is not None and ary.context != context:
+                raise ValueError('cannot mix contexts')
+            context = ary.context
+            if cls is None or cls == GpuArray:
+                cls = ary.__class__
+            need_context = False
+
+    if need_context and context is None:
+        context = get_default_context()
+        cls = ndgpuarray
+
+    if not isinstance(a, GpuArray):
+        a = numpy.asarray(a)
+        if a.flags.f_contiguous and not a.flags.c_contiguous:
+            order = 'F'
+        else:
+            order = 'C'
+        a = array(a, dtype=a.dtype, copy=False, order=order, context=context,
+                  cls=cls)
+
+    if a.dtype == numpy.dtype('float16'):
+        # Gives wrong results currently, see
+        # https://github.com/Theano/libgpuarray/issues/316
+        raise NotImplementedError('float16 currently broken')
+
+    prom_dtypes_in, result_dtypes = ufunc_dtypes(ufunc_name, [a.dtype])
+    prom_dtype_in = prom_dtypes_in[0]
+    result_dtype = result_dtypes[0]
+
+    # This is the "fallback signature" case, for us it signals failure.
+    # TypeError is what Numpy raises, too, which is kind of wrong
+    if prom_dtype_in == numpy.dtype(object):
+        raise TypeError('input dtype {!r} invalid for ufunc {!r}'
+                        ''.format(a.dtype.name, ufunc_name))
+
+    # Convert input such that the kernel runs
+    # TODO: can this be avoided?
+    a = a.astype(prom_dtype_in, copy=False)
+
+    if out is None:
+        out = empty(a.shape, dtype=result_dtype, context=context, cls=cls)
+    else:
+        # TODO: allow larger dtype
+        if out.dtype != result_dtype:
+            raise ValueError('`out.dtype` != result dtype ({!r} != {!r})'
+                             ''.format(out.dtype.name, result_dtype.name))
+
+    # C result dtype for casting
+    c_res_dtype = dtype_to_ctype(result_dtype)
+
+    oper = ''
+
+    # Case 1: math function
+    if ufunc_name in UNARY_UFUNC_TO_C_FUNC:
+        c_func = ufunc_c_fname(ufunc_name, (a.dtype,))
+        # Shortcut for abs() with unsigned int. This also fixes a CUDA quirk
+        # that makes abs() fail with unsigned ints.
+        if (ufunc_name == 'absolute' and
+                numpy.issubsctype(a.dtype, numpy.unsignedinteger)):
+            out[:] = a.copy()
+            return out
+        else:
+            oper = 'res = ({}) {}(a)'.format(c_res_dtype, c_func)
+
+    # Case 2: unary operator
+    unop = UNARY_UFUNC_TO_C_OP.get(ufunc_name, None)
+    if unop is not None:
+        if a.dtype == numpy.bool and unop == '-':
+            if parse_version(numpy.__version__) >= parse_version('1.13'):
+                # Numpy >= 1.13 raises a TypeError
+                raise TypeError(
+                    'negation of boolean arrays is not supported, use '
+                    '`logical_not` instead')
+            else:
+                # Warn and remap to logical not
+                warnings.warn('using negation (`-`) with boolean arrays is '
+                              'deprecated, use `logical_not` (`~`) instead; '
+                              'the current behavior will be changed along '
+                              "with NumPy's", FutureWarning)
+                unop = '!'
+        oper = 'res = ({}) {}a'.format(c_res_dtype, unop)
+
+    # Other cases: specific functions
+    if ufunc_name == 'deg2rad':
+        oper = 'res = ({rdt})({:.45f}) * ({rdt}) a'.format(numpy.deg2rad(1.0),
+                                                           rdt=c_res_dtype)
+
+    elif ufunc_name == 'rad2deg':
+        oper = 'res = ({rdt})({:.45f}) * ({rdt}) a'.format(numpy.rad2deg(1.0),
+                                                           rdt=c_res_dtype)
+
+    elif ufunc_name == 'reciprocal':
+        oper = 'res = ({dt}) (({dt}) 1.0) / a'.format(dt=c_res_dtype)
+
+    elif ufunc_name == 'sign':
+        oper = 'res = ({}) ((a > 0) ? 1 : (a < 0) ? -1 : 0)'.format(
+            c_res_dtype)
+
+    elif ufunc_name == 'signbit':
+        oper = 'res = ({}) (a < 0)'.format(c_res_dtype)
+
+    elif ufunc_name == 'square':
+        oper = 'res = ({}) (a * a)'.format(c_res_dtype)
+
+    elif ufunc_name == 'isfinite':
+        oper = '''
+        res = ({}) (a != INFINITY && a != -INFINITY && !isnan(a))
+        '''.format(c_res_dtype)
+
+    elif ufunc_name == 'isinf':
+        oper = 'res = ({}) (a == INFINITY || a == -INFINITY)'.format(
+            c_res_dtype)
+
+    elif ufunc_name == 'isnan':
+        oper = 'res = ({}) (abs(isnan(a)))'.format(c_res_dtype)
+
+    elif ufunc_name == 'spacing':
+        if numpy.issubsctype(a.dtype, numpy.integer):
+            # TODO: float16 as soon as it is properly supported
+            cast_dtype = numpy.result_type(a.dtype, numpy.float32)
+            c_cast_dtype = dtype_to_ctype(cast_dtype)
+        else:
+            c_cast_dtype = dtype_to_ctype(a.dtype)
+        oper = '''
+        res = ({}) ((a < 0) ?
+                    nextafter(({ct}) a, ({ct}) a - 1) - a :
+                    nextafter(({ct}) a, ({ct}) a + 1) - a)
+        '''.format(c_res_dtype, ct=c_cast_dtype)
+
+    if not oper:
+        raise ValueError('`ufunc_name` {!r} does not represent a unary ufunc'
+                         ''.format(ufunc_name))
+
+    a_arg = as_argument(a, 'a', read=True)
+    args = [arg('res', out.dtype, write=True), a_arg]
+
+    ker = GpuElemwise(context, oper, args, convert_f16=True)
+    ker(out, a)
+    return out
+
+
+def binary_ufunc(a, b, ufunc_name, out=None, context=None):
+    """Call binary ufunc on ``a`` and ``b``.
+
+    Parameters
+    ----------
+    a, b : `array-like`
+        Input arrays.
+    ufunc_name : str
+        Name of the NumPy ufunc to be called on ``a`` and ``b``.
+    out : `pygpu.gpuarray.GpuArray`, optional
+        Array in which to store the result. Its shape must be equal to
+        ``a.shape`` and its dtype must be the result dtype of the called
+        function.
+    context : `pygpu.gpuarray.GpuContext`, optional
+        Use this GPU context to evaluate the GPU kernel. For ``None``,
+        and if neither ``out`` nor ``a`` are GPU arrays, a default
+        GPU context must have been set.
+
+    Returns
+    -------
+    out : `pygpu.gpuarray.GpuArray`
+        Result of the computation. If ``out`` was given, the returned
+        object is a reference to it. If both input arrays were not
+        GPU arrays, the type of ``out`` is `pygpu._array.ndgpuarray`.
+
+    See Also
+    --------
+    pygpu.gpuarray.set_default_context
+    """
+    # Lazy import to avoid circular dependency
+    from pygpu._array import ndgpuarray
+
+    # Get a context and an array class to work with
+    cls = None
+    need_context = True
+    for ary in (a, b, out):
+        if isinstance(a, GpuArray):
+            context = ary.context
+            cls = ary.__class__
+            need_context = False
+            break
+
+    if need_context and context is None:
+        context = get_default_context()
+        cls = ndgpuarray
+
+    if not isinstance(a, GpuArray):
+        if numpy.isscalar(a):
+            # TODO: this is quite hacky, perhaps mixed input signatures
+            # should be handled in ufunc_dtypes?
+            if ufunc_name == 'ldexp':
+                # Want signed type
+                # TODO: this upcasts to a larger dtype than Numpy in
+                # some cases (mostly unsigned int)
+                a = numpy.asarray(a, dtype=numpy.min_scalar_type(-abs(a)))
+            else:
+                a = numpy.asarray(a, dtype=numpy.result_type(a, b))
+
+        if a.flags.f_contiguous and not a.flags.c_contiguous:
+            order = 'F'
+        else:
+            order = 'C'
+        a = array(a, dtype=a.dtype, copy=False, order=order, context=context,
+                  cls=cls)
+
+    if not isinstance(b, GpuArray):
+        if numpy.isscalar(b):
+            # TODO: this is quite hacky, perhaps mixed input signatures
+            # should be handled in ufunc_dtypes?
+            if ufunc_name == 'ldexp':
+                # Want signed type
+                b = numpy.asarray(b, dtype=numpy.min_scalar_type(-abs(b)))
+            else:
+                b = numpy.asarray(b, dtype=numpy.result_type(a, b))
+
+        if isinstance(b, numpy.ndarray):
+            if b.flags.f_contiguous and not b.flags.c_contiguous:
+                order = 'F'
+            else:
+                order = 'C'
+        else:
+            b = numpy.asarray(b)
+            order = 'C'
+        b = array(b, dtype=b.dtype, copy=False, order=order, context=context,
+                  cls=cls)
+
+    if builtin_any(ary.dtype == numpy.dtype('float16') for ary in (a, b)):
+        # Gives wrong results currently, see
+        # https://github.com/Theano/libgpuarray/issues/316
+        raise NotImplementedError('float16 currently broken')
+
+    prom_dtypes_in, result_dtypes = ufunc_dtypes(ufunc_name,
+                                                 [a.dtype, b.dtype])
+    prom_dtype_a, prom_dtype_b = prom_dtypes_in
+    result_dtype = result_dtypes[0]
+
+    # This is the "fallback signature" case, for us it signals failure
+    if builtin_any(dt == numpy.dtype(object) for dt in prom_dtypes_in):
+        raise TypeError('input dtypes {} invalid for ufunc {!r}'
+                        ''.format((a.dtype.name, b.dtype.name), ufunc_name))
+
+    # Convert input such that the kernel runs
+    # TODO: can this be avoided?
+    a = a.astype(prom_dtype_a, copy=False)
+    b = b.astype(prom_dtype_b, copy=False)
+
+    if a.ndim != b.ndim:
+        nd = max(a.ndim, b.ndim)
+        if a.ndim < nd:
+            a = a.reshape(((1,) * (nd - a.ndim)) + a.shape)
+        if b.ndim < nd:
+            b = b.reshape(((1,) * (nd - b.ndim)) + b.shape)
+    result_shape = tuple(max(sa, sb) for sa, sb in zip(a.shape, b.shape))
+
+    if out is None:
+        out = empty(result_shape, dtype=result_dtype, context=context, cls=cls)
+    else:
+        if out.shape != result_shape:
+            raise ValueError('`out.shape` != result shape ({} != {})'
+                             ''.format(out.shape, result_shape))
+        if out.dtype != result_dtype:
+            raise ValueError('`out.dtype` != result dtype ({!r} != {!r})'
+                             ''.format(out.dtype.name, result_dtype.name))
+
+    a_arg = as_argument(a, 'a', read=True)
+    b_arg = as_argument(b, 'b', read=True)
+    args = [arg('res', result_dtype, write=True), a_arg, b_arg]
+
+    # C result dtype for casting
+    c_res_dtype = dtype_to_ctype(result_dtype)
+
+    # Set string for mapping operation and preamble for additional functions
+    oper = ''
+    preamble = ''
+
+    # Case 1: math function
+    if ufunc_name in BINARY_UFUNC_TO_C_FUNC:
+        if ufunc_name == 'power':
+            # Arguments to `pow` cannot be integer, need to cast
+            if numpy.issubsctype(result_dtype, numpy.integer):
+                tpl = 'res = ({rt}) (long) round(pow((double) a, (double) b))'
+                oper = tpl.format(rt=c_res_dtype)
+            else:
+                oper = 'res = ({rt}) pow(({rt}) a, ({rt}) b)'.format(
+                    rt=c_res_dtype)
+
+        elif ufunc_name == 'fmod':
+            # Arguments to `fmod` cannot be integer, need to cast
+            if numpy.issubsctype(result_dtype, numpy.integer):
+                oper = 'res = ({rt}) fmod((double) a, (double) b)'.format(
+                    rt=c_res_dtype)
+            else:
+                oper = 'res = ({rt}) fmod(({rt}) a, ({rt}) b)'.format(
+                    rt=c_res_dtype)
+
+        else:
+            c_fname = ufunc_c_fname(ufunc_name, [a.dtype, b.dtype])
+            oper = 'res = ({}) {}(a, b)'.format(c_res_dtype, c_fname)
+
+    # Case 2: binary operator
+    binop = BINARY_UFUNC_TO_C_OP.get(ufunc_name, None)
+    if binop is not None:
+        if b.dtype == numpy.bool and binop == '-':
+            if parse_version(numpy.__version__) >= parse_version('1.13'):
+                # Numpy >= 1.13 raises a TypeError
+                raise TypeError(
+                    'subtraction of boolean arrays is not supported, use '
+                    '`logical_not` instead')
+            else:
+                # Warn and remap to logical not
+                warnings.warn('using subtraction (`-`) with boolean arrays is '
+                              'deprecated, use `bitwise_xor` (`^`) or '
+                              '`logical_xor` instead; '
+                              'the current behavior will be changed along '
+                              "with NumPy's", FutureWarning)
+                binop = '^'
+
+        oper = 'res = ({}) (a {} b)'.format(c_res_dtype, binop)
+    else:
+        # Other cases: specific functions
+        if ufunc_name == 'floor_divide':
+            # implement as sign(a/b) * int(abs(a/b) + shift(a,b))
+            # where shift(a,b) = 0 if sign(a) == sign(b) else 1 - epsilon
+            preamble = '''
+            WITHIN_KERNEL long
+            floor_div_dbl(double a, double b) {
+                double quot = a / b;
+                if ((a < 0) != (b < 0)) {
+                    return - (long) (quot + 0.999);
+                } else {
+                    return (long) quot;
+                }
+            }
+            '''
+            oper = 'res = ({}) floor_div_dbl((double) a, (double) b)'.format(
+                c_res_dtype)
+
+        elif ufunc_name == 'true_divide':
+            if result_dtype == numpy.dtype('float64'):
+                flt = 'double'
+            else:
+                flt = 'float'
+            oper = 'res = ({}) (({flt}) a / ({flt}) b)'.format(c_res_dtype,
+                                                               flt=flt)
+
+        elif ufunc_name == 'logical_xor':
+            oper = 'res = ({}) (a ? !b : b)'.format(c_res_dtype)
+
+        elif ufunc_name == 'maximum':
+            oper = 'res = ({}) ((a > b) ? a : b)'.format(c_res_dtype)
+
+        elif ufunc_name == 'minimum':
+            oper = 'res = ({}) ((a < b) ? a : b)'.format(c_res_dtype)
+
+        elif ufunc_name == 'remainder':
+            # The same as `fmod` except for b < 0, where we have
+            # remainder(a, b) = fmod(a, b) + b
+            if numpy.issubsctype(result_dtype, numpy.integer):
+                cast_type = 'double'
+            else:
+                cast_type = c_res_dtype
+
+            preamble = mako.template.Template('''
+            WITHIN_KERNEL ${ct}
+            rem(${ct} a, ${ct} b) {
+                ${ct} modval = fmod(a, b);
+                if (b < 0 && modval != 0) {
+                    return b + modval;
+                } else {
+                    return modval;
+                }
+            }
+            ''').render(ct=cast_type)
+            oper = 'res = ({rt}) rem(({ct}) a, ({ct}) b)'.format(
+                rt=c_res_dtype, ct=cast_type)
+
+        elif ufunc_name == 'logaddexp':
+            oper = 'res = ({}) log(exp(a) + exp(b))'.format(c_res_dtype)
+
+        elif ufunc_name == 'logaddexp2':
+            oper = '''
+            res = ({}) log(exp(a * log(2.0)) + exp(b * log(2.0))) / log(2.0)
+            '''.format(c_res_dtype)
+
+    if not oper:
+        raise ValueError('`ufunc_name` {!r} does not represent a binary '
+                         'ufunc'.format(ufunc_name))
+
+    kernel = GpuElemwise(a.context, oper, args, convert_f16=True,
+                         preamble=preamble)
+    kernel(out, a, b, broadcast=True)
+    return out
+
+
+def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None, context=None):
+    """Call a unary ufunc with two outputs on an array ``a``.
+
+    Parameters
+    ----------
+    a : `array-like`
+        Input array.
+    ufunc_name : str
+        Name of the NumPy ufunc to be called on ``a``.
+    out1, out2 : `pygpu.gpuarray.GpuArray`, optional
+        Arrays in which to store the result. Their shape must be equal to
+        ``a.shape`` and their dtype must be the result dtypes of the
+        called function.
+        The arrays ``out1`` and ``out2`` must either both be provided,
+        or none of them.
+    context : `pygpu.gpuarray.GpuContext`, optional
+        Use this GPU context to evaluate the GPU kernel. For ``None``,
+        and if neither ``out1`` nor ``out2`` nor ``a`` are GPU arrays,
+        a default GPU context must have been set.
+
+    Returns
+    -------
+    out1, out2 : `pygpu.gpuarray.GpuArray`
+        Results of the computation. If ``out1``, ``out2`` were given,
+        the returned object is a reference to it.
+        If ``out1`` and ``out2`` are ``None`` and ``a`` is not a GPU array,
+        the result arrays ``out1`` and ``out2`` are instances of
+        `pygpu._array.ndgpuarray`.
+
+    See Also
+    --------
+    ufunc_result_dtype : Get dtype of a ufunc result.
+    ufunc_c_funcname : Get C name for math ufuncs.
+    """
+    # Lazy import to avoid circular dependency
+    from pygpu._array import ndgpuarray
+
+    # Get a context and an array class to work with. Use the "highest"
+    # class present in the inputs.
+    need_context = True
+    cls = None
+    for ary in (a, out1, out2):
+        if isinstance(ary, GpuArray):
+            if context is not None and ary.context != context:
+                raise ValueError('cannot mix contexts')
+            context = ary.context
+            if cls is None or cls == GpuArray:
+                cls = ary.__class__
+            need_context = False
+
+    if need_context and context is None:
+        context = get_default_context()
+        cls = ndgpuarray
+
+    if not isinstance(a, GpuArray):
+        a = numpy.asarray(a)
+        if a.flags.f_contiguous and not a.flags.c_contiguous:
+            order = 'F'
+        else:
+            order = 'C'
+        a = array(a, dtype=a.dtype, copy=False, order=order, context=context,
+                  cls=cls)
+
+    if a.dtype == numpy.dtype('float16'):
+        # Gives wrong results currently, see
+        # https://github.com/Theano/libgpuarray/issues/316
+        raise NotImplementedError('float16 currently broken')
+
+    prom_dtypes_in, result_dtypes = ufunc_dtypes(ufunc_name, [a.dtype])
+    prom_dtype_in = prom_dtypes_in[0]
+    result_dtype1, result_dtype2 = result_dtypes
+
+    # This is the "fallback signature" case, for us it signals failure.
+    # TypeError is what Numpy raises, too, which is kind of wrong
+    if prom_dtype_in == numpy.dtype(object):
+        raise TypeError('input dtype {!r} invalid for ufunc {!r}'
+                        ''.format(a.dtype.name, ufunc_name))
+
+    # Convert input such that the kernel runs
+    # TODO: can this be avoided?
+    a = a.astype(prom_dtype_in, copy=False)
+
+    if out1 is None:
+        out1 = empty(a.shape, dtype=result_dtype1, context=context, cls=cls)
+    else:
+        # TODO: allow larger dtype
+        if out1.dtype != result_dtype1:
+            raise ValueError('`out1.dtype` != result dtype: {!r} != {!r}'
+                             ''.format(out1.dtype.name, result_dtype1.name))
+    if out2 is None:
+        out2 = empty(a.shape, dtype=result_dtype2, context=context, cls=cls)
+    else:
+        if out2.dtype != result_dtype2:
+            raise ValueError('`out2.dtype` != result dtype: {!r} != {!r}'
+                             ''.format(out2.dtype.name, result_dtype2.name))
+
+    # C result dtypes for casting
+    c_res_dtype1 = dtype_to_ctype(result_dtype1)
+
+    oper = ''
+
+    if ufunc_name == 'frexp':
+        oper = 'res = ({rdt1}) frexp(a, &out2)'.format(rdt1=c_res_dtype1)
+    elif ufunc_name == 'modf':
+        oper = 'res = ({rdt1}) modf(a, &out2)'.format(rdt1=c_res_dtype1)
+
+    if not oper:
+        raise ValueError('`ufunc_name` {!r} does not represent a unary ufunc'
+                         ''.format(ufunc_name))
+
+    a_arg = as_argument(a, 'a', read=True)
+    args = [arg('res', out1.dtype, write=True),
+            arg('out2', out2.dtype, write=True),
+            a_arg]
+
+    ker = GpuElemwise(context, oper, args, convert_f16=True)
+    ker(out1, out2, a)
+    return out1, out2
+
+
+# TODO: add ufuncs conditionally depending on Numpy version?
+MISSING_UFUNCS = [
+    'conjugate',  # no complex dtype yet
+    'float_power',  # new in Numpy 1.12
+    'divmod',  # new in Numpy 1.13
+    'heaviside',  # new in Numpy 1.13
+    'isnat',  # new in Numpy 1.13
+    'positive',  # new in Numpy 1.13
+    ]
+
+
+UFUNC_SYNONYMS = [
+    ('absolute', 'abs'),
+    # ('conjugate', 'conj'),
+    ('deg2rad', 'degrees'),
+    ('rad2deg', 'radians'),
+    ('true_divide', 'divide'),
+    ('maximum', 'fmax'),  # TODO: differ in NaN propagation in numpy, doable?
+    ('minimum', 'fmin'),  # TODO: differ in NaN propagation in numpy, doable?
+    ('bitwise_not', 'invert'),
+    ('remainder', 'mod')
+    ]
+
+
+# --- Reductions from binary ufuncs --- #
+
+
+def make_binary_ufunc_reduce(name):
+    npy_ufunc = getattr(numpy, name)
+    binop = BINARY_UFUNC_TO_C_BINOP.get(name, None)
+    if binop is not None:
+
+        def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False,
+                           context=None):
+            return reduce_with_op(a, binop, npy_ufunc.identity, axis, dtype,
+                                  out, keepdims, context)
+
+        return reduce_wrapper
+
+    cmp = BINARY_UFUNC_TO_C_CMP.get(name, None)
+    if cmp is not None:
+
+        def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False,
+                           context=None):
+            return reduce_with_cmp(a, cmp, npy_ufunc.identity, axis, dtype,
+                                   out, keepdims, context)
+
+        return reduce_wrapper
+
+    # TODO: add reduction with binary function, not possible currently since
+    # no sensible "neutral" can be defined. We need a variant of `reduce1`
+    # that initializes the accumulator with the first element along a given
+    # reduction axis.
+    return None
+
+
+# --- Add (incomplete) ufunc class --- #
+
+
+# TODO: find out how to use one Ufunc class with __call__ signature depending
+# on nin and nout. What doesn't work:
+# - Setting __call__ on the class in __new__ since later instantiations of
+#   the class overwrite previous ones (only one class object)
+#   object will not be registered as callable
+# - Setting __call__ on an instance (in __new__ or __init__), since the
+#   object will not be registered as callable
+# - Metaclass using __call__, need to find a more complete reference
+#
+# We need some way of setting the __call__ signature of an instance,
+# probably by modifying __call__.__code__.
+
+
+class UfuncBase(object):
+
+    def __init__(self, name, nin, nout, call, **kwargs):
+        self.__name__ = name
+        self.nin = nin
+        self.nout = nout
+        self.nargs = self.nin + self.nout
+        self._call = call
+
+        self.identity = kwargs.pop('identity', None)
+
+        # Wrappers for unimplemented stuff
+
+        def _at_not_impl(a, indices, b=None, context=None):
+            return NotImplemented
+
+        def _accumulate_not_impl(array, axis=0, dtype=None, out=None,
+                                 keepdims=None, context=None):
+            return NotImplemented
+
+        def _outer_not_impl(A, B, context=None, **kwargs):
+            return NotImplemented
+
+        def _reduce_not_impl(a, axis=0, dtype=None, out=None,
+                             keepdims=False, context=None):
+            return NotImplemented
+
+        def _reduceat_not_impl(a, indices, axis=0, dtype=None, out=None,
+                               context=None):
+            return NotImplemented
+
+        self.accumulate = kwargs.pop('accumulate', _accumulate_not_impl)
+        self.accumulate.__name__ = 'accumulate'
+        if PY3:
+            self.accumulate.__qualname__ = name + '.accumulate'
+
+        self.at = kwargs.pop('at', _at_not_impl)
+        self.at.__name__ = 'at'
+        if PY3:
+            self.at.__qualname__ = name + '.at'
+
+        self.outer = kwargs.pop('outer', _outer_not_impl)
+        self.outer.__name__ = 'outer'
+        if PY3:
+            self.outer.__qualname__ = name + '.outer'
+
+        reduce = kwargs.pop('reduce', None)
+        if reduce is None:
+            self.reduce = _reduce_not_impl
+        else:
+            self.reduce = reduce
+        self.reduce.__name__ = 'reduce'
+        if PY3:
+            self.reduce.__qualname__ = name + '.reduce'
+
+        self.reduceat = kwargs.pop('reduceat', _reduceat_not_impl)
+        self.reduceat.__name__ = 'reduceat'
+        if PY3:
+            self.reduceat.__qualname__ = name + '.reduceat'
+
+    def __repr__(self):
+        return '<ufunc {}>'.format(self.__name__)
+
+
+class Ufunc01(UfuncBase):
+
+    def __call__(self, out=None, context=None):
+        return self._call(out=out, context=context)
+
+
+class Ufunc11(UfuncBase):
+
+    def __call__(self, x, out=None, context=None):
+        return self._call(x, out=out, context=context)
+
+
+class Ufunc12(UfuncBase):
+
+    def __call__(self, x, out1=None, out2=None, context=None):
+        return self._call(x, out1=out1, out2=out2, context=context)
+
+
+class Ufunc21(UfuncBase):
+
+    def __call__(self, x1, x2, out=None, context=None):
+        return self._call(x1, x2, out=out, context=context)
+
+
+# --- Add ufuncs to global namespace --- #
+
+
+def make_ufunc(name):
+    npy_ufunc = getattr(numpy, name)
+    nin = npy_ufunc.nin
+    nout = npy_ufunc.nout
+
+    if nin == 1 and nout == 1:
+        cls = Ufunc11
+
+        def call(a, out=None, context=None):
+            return unary_ufunc(a, name, out, context)
+
+    elif nin == 1 and nout == 2:
+        cls = Ufunc12
+
+        def call(a, out1=None, out2=None, context=None):
+            return unary_ufunc_two_out(a, name, out1, out2, context)
+
+    elif nin == 2 and nout == 1:
+        cls = Ufunc21
+
+        def call(a, b, out=None, context=None):
+            return binary_ufunc(a, b, name, out, context)
+
+    else:
+        raise NotImplementedError('nin = {}, nout = {} not supported'
+                                  ''.format(nin, nout))
+
+    call.__name__ = name
+    if PY3:
+        call.__qualname__ = name
+    # TODO: add docstring
+
+    if nin == 1:
+        ufunc = cls(name, nin, nout, call, identity=npy_ufunc.identity)
+    else:
+        ufunc = cls(name, nin, nout, call, identity=npy_ufunc.identity,
+                    reduce=make_binary_ufunc_reduce(name))
+
+    return ufunc
+
+
+# Add the ufuncs to the module dictionary
+for name in UNARY_UFUNCS + UNARY_UFUNCS_TWO_OUT + BINARY_UFUNCS:
+    globals()[name] = make_ufunc(name)
+
+
+for name, alt_name in UFUNC_SYNONYMS:
+    globals()[alt_name] = globals()[name]
+
+
+if __name__ == '__main__':
+    from doctest import testmod, NORMALIZE_WHITESPACE
+    optionflags = NORMALIZE_WHITESPACE
+    testmod(optionflags=NORMALIZE_WHITESPACE, extraglobs={'np': numpy})