From 2b777fca012c399d1bcb003bf97ba68617a75c8e Mon Sep 17 00:00:00 2001 From: Gabriel Scherer Date: Mon, 13 Jun 2022 13:09:24 +0200 Subject: [PATCH 01/10] parsing/builtin_attributes: support code for list literals --- parsing/builtin_attributes.ml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/parsing/builtin_attributes.ml b/parsing/builtin_attributes.ml index c90542567ada..940bb1488da7 100644 --- a/parsing/builtin_attributes.ml +++ b/parsing/builtin_attributes.ml @@ -30,6 +30,22 @@ let string_of_opt_payload p = | Some s -> s | None -> "" +let list_of_exp exp = + let rec loop acc = function + | {pexp_desc = Pexp_construct ({txt = Longident.Lident "[]"; _}, None)} -> + Ok (List.rev acc) + | {pexp_desc = Pexp_construct ({txt = Longident.Lident "::"; _}, + Some {pexp_desc = Pexp_tuple [e1; e2]})} -> + loop (e1 :: acc) e2 + | {pexp_loc = loc} -> + Error loc + in loop [] exp + +let list_of_payload loc = function + | PStr[{pstr_desc = Pstr_eval (li, _)}] -> + list_of_exp li + | _ -> Error loc + let error_of_extension ext = let submessage_from main_loc main_txt = function | {pstr_desc=Pstr_extension From 98e405bb1abcc8b2f0c08f673d07bc008c68d7c1 Mon Sep 17 00:00:00 2001 From: Gabriel Scherer Date: Mon, 13 Jun 2022 13:10:22 +0200 Subject: [PATCH 02/10] parsing/builtin_attributes support for [@shape [int; double; lazy; object]] --- parsing/builtin_attributes.ml | 46 ++++++++++++++++++++++++++++++++++ parsing/builtin_attributes.mli | 2 ++ utils/misc.ml | 15 +++++++++++ utils/misc.mli | 15 +++++++++++ 4 files changed, 78 insertions(+) diff --git a/parsing/builtin_attributes.ml b/parsing/builtin_attributes.ml index 940bb1488da7..edc81760849c 100644 --- a/parsing/builtin_attributes.ml +++ b/parsing/builtin_attributes.ml @@ -303,3 +303,49 @@ let has_unboxed attr = let has_boxed attr = List.exists (check ["ocaml.boxed"; "boxed"]) attr + + +let find_shapes attrs : Misc.named_shape list option = + let err loc msg = + warn_payload loc "shape" msg; + None + in + let shape_of_exp exp = + let shapes = [ + "int", `Int; + "lazy", `Lazy; + "closure", `Closure; + "infix", `Infix; + "forward", `Forward; + "abstract", `Abstract; + "string", `String; + "double", `Double; + "double_array", `Double_array; + "custom", `Custom; + ] in + let loc = exp.pexp_loc in + match exp.pexp_desc with + | Pexp_ident {loc; txt = Longident.Lident shape_name} -> + begin match List.assoc_opt shape_name shapes with + | Some tag -> Some tag + | None -> + Printf.ksprintf (err loc) "Unknown shape name %s." shape_name + end + | _ -> + err loc "Unsupported shape format" + in + let shape_of_payload loc payload = + match list_of_payload loc payload with + | Error loc -> err loc "A shape list such as [int; lazy; custom] was expected." 
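+    (* [shape_of_exp] warns and returns [None] on an unknown or
+       ill-formed entry, so the [filter_map] below drops it rather
+       than aborting the whole attribute. *)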
+ | Ok args -> Some (List.filter_map shape_of_exp args) + in + let find_shape_payload a = + match a.attr_name.txt with + | "shape" | "ocaml.shape" -> + shape_of_payload a.attr_loc a.attr_payload + | _ -> None + in + match List.filter_map find_shape_payload attrs with + | [] -> None + | (_ :: _) as shape_specs -> + Some (List.fold_left List.rev_append [] shape_specs) diff --git a/parsing/builtin_attributes.mli b/parsing/builtin_attributes.mli index 6200fd74ec54..15d3e32924c8 100644 --- a/parsing/builtin_attributes.mli +++ b/parsing/builtin_attributes.mli @@ -82,3 +82,5 @@ val immediate64: Parsetree.attributes -> bool val has_unboxed: Parsetree.attributes -> bool val has_boxed: Parsetree.attributes -> bool + +val find_shapes: Parsetree.attributes -> Misc.named_shape list option diff --git a/utils/misc.ml b/utils/misc.ml index 942061720d85..9ed1dc921b62 100644 --- a/utils/misc.ml +++ b/utils/misc.ml @@ -1124,3 +1124,18 @@ module Magic_number = struct | Error err -> Error (Unexpected_error err) | Ok () -> Ok info end + + +(* shapes that the user can name in [@shape ...] payloads *) +type named_shape = [ + | `Int (* all immediate shapes *) + | `Lazy (* Obj.lazy_tag, etc. *) + | `Closure + | `Infix + | `Forward + | `Abstract + | `String + | `Double + | `Double_array + | `Custom +] diff --git a/utils/misc.mli b/utils/misc.mli index 6aea772091a9..fea7f2a108ca 100644 --- a/utils/misc.mli +++ b/utils/misc.mli @@ -671,3 +671,18 @@ module Magic_number : sig val all_kinds : kind list end + + +(* shapes that the user can name in [@shape ...] payloads *) +type named_shape = [ + | `Int (* all immediate shapes *) + | `Lazy (* Obj.lazy_tag, etc. *) + | `Closure + | `Infix + | `Forward + | `Abstract + | `String + | `Double + | `Double_array + | `Custom +] From dd3313467630c37fb025858f987eb5d7397b9e54 Mon Sep 17 00:00:00 2001 From: Gabriel Scherer Date: Mon, 13 Jun 2022 15:39:03 +0200 Subject: [PATCH 03/10] typedecl_unboxed: support for [@shape ...] 
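With the attribute support from the two previous patches, an abstract
type can now declare the possible head shapes of its values, and the
disjointness check below consumes that declaration instead of
conservatively assuming any shape. A hedged sketch of the intended
surface syntax (the type [t] is hypothetical, and attaching the
attribute with [@@...] on the declaration is my reading of
Builtin_attributes.find_shapes consuming type_attributes; the shape
names are the ones it accepts):

    (* values of [t] are either immediates or custom blocks,
       as for a numeric type implemented in C *)
    type t [@@shape [int; custom]]

Overlapping names, within one list or across several [@@shape]
attributes, denote intersecting shape sets and are rejected with the
new "conflicting shapes" error.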
--- typing/typedecl_unboxed.ml | 59 +++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/typing/typedecl_unboxed.ml b/typing/typedecl_unboxed.ml index 059a206f5b9a..add789cb0eb2 100644 --- a/typing/typedecl_unboxed.ml +++ b/typing/typedecl_unboxed.ml @@ -170,6 +170,13 @@ module Head_shape = struct head_separated = true; } + let any_imm_shape = + { + head_imm = Shape_any; + head_blocks = Shape_set []; + head_separated = true; + } + let block_shape tags = let imm = Shape_set [] in let blocks = Shape_set tags in @@ -189,16 +196,32 @@ module Head_shape = struct true); } - let disjoint_union hd1 hd2 = + let of_named : Misc.named_shape -> t = function + | `Int -> any_imm_shape + | `Lazy -> block_shape [Obj.lazy_tag] + | `Closure -> block_shape [Obj.closure_tag] + | `Infix -> block_shape [Obj.infix_tag] + | `Forward -> block_shape [Obj.forward_tag] + | `Abstract -> block_shape [Obj.abstract_tag] + | `String -> block_shape [Obj.string_tag] + | `Double -> block_shape [Obj.double_tag] + | `Double_array -> block_shape [Obj.double_array_tag] + | `Custom -> block_shape [Obj.custom_tag] + + let union ~disjoint hd1 hd2 = + let conflict_or k = + if disjoint then raise Conflict; + k () + in let union s1 s2 = match s1, s2 with | Shape_any, Shape_set [] | Shape_set [], Shape_any -> Shape_any - | Shape_any, _ | _, Shape_any -> raise Conflict + | Shape_any, _ | _, Shape_any -> conflict_or (fun () -> Shape_any) | Shape_set l1, Shape_set l2 -> (* invariant : l1 and l2 are sorted *) let rec merge l1 l2 = match l1, l2 with | [], l | l, [] -> l | x :: xx, y :: yy -> - if x = y then raise Conflict + if x = y then conflict_or (fun () -> x :: merge xx yy) else if x < y then x :: (merge xx l2) else y :: (merge l1 yy) in Shape_set (merge l1 l2) @@ -213,6 +236,9 @@ module Head_shape = struct in { head_imm; head_blocks; head_separated } + let disjoint_union hd1 hd2 = + union ~disjoint:true hd1 hd2 + module Callstack = struct type t = Path.t list @@ -269,26 +295,21 @@ module Head_shape = struct | _ -> ty let of_primitive_type = function - | Int -> - { - head_imm = Shape_any; - head_blocks = Shape_set []; - head_separated = true; - } - | Float -> block_shape [Obj.double_tag] + | Int -> of_named `Int + | Float -> of_named `Double | String - | Bytes -> block_shape [Obj.string_tag] + | Bytes -> of_named `String | Array -> block_shape (if Config.flat_float_array then [0] else [0; Obj.double_array_tag]) - | Floatarray -> block_shape [Obj.double_array_tag] + | Floatarray -> of_named `Double_array | Lazy -> any_shape (* Lazy values can 'shortcut' the lazy block, and thus have many different tags. When Config.flat_float_array, they cannot be floats, so we might want to refine that if there are strong use-cases. *) - | Custom -> block_shape [Obj.custom_tag] + | Custom -> of_named `Custom let rec of_type_expr env ty callstack_map = (* TODO : try the Ctype.expand_head_opt version here *) @@ -351,8 +372,18 @@ module Head_shape = struct match ty_descr with | Type_abstract -> begin match ty_decl.type_manifest with - | None -> any_shape | Some ty -> of_type_expr_with_params ty + | None -> + begin match Builtin_attributes.find_shapes ty_decl.type_attributes with + | None -> any_shape + | Some named_shapes -> + let shapes = List.map of_named named_shapes in + try List.fold_left disjoint_union empty_shape shapes + with Conflict -> + (* TODO use a proper error here as well *) + Location.raise_errorf ~loc:ty_decl.type_loc ~sub:[] + "The [@shape ...] attribute(s) contains conflicting shapes." 
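+              (* reached e.g. when two [@@shape ...] attributes both
+                 name [custom], or when one list repeats a name: the
+                 merge in [disjoint_union] then sees the same tag
+                 twice and raises [Conflict]. *)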
+ end end | Type_record (_, Record_regular) -> block_shape [0] | Type_record (_, Record_float) -> block_shape [Obj.double_array_tag] From 5e26a4503e5b7d144062c7f08b5c443d17e7ddc3 Mon Sep 17 00:00:00 2001 From: Gabriel Scherer Date: Mon, 13 Jun 2022 15:39:37 +0200 Subject: [PATCH 04/10] fixup! document typedecl_unboxed.ml{,i} --- typing/typedecl_unboxed.mli | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/typing/typedecl_unboxed.mli b/typing/typedecl_unboxed.mli index f86db2669903..b544960b93a8 100644 --- a/typing/typedecl_unboxed.mli +++ b/typing/typedecl_unboxed.mli @@ -41,7 +41,7 @@ module Head_shape : sig val pp : Format.formatter -> t -> unit - (** Check a new type decalaration, that may be a variant type + (** Check a new type declaration, that may be a variant type containing unboxed constructors, to verify that the unboxing requests respect the "disjointness" requirement of constructor unboxing -- the values of two constructors must not conflict. From e37aed4dadbb7975c3ec5d5aae61085a1f5f7917 Mon Sep 17 00:00:00 2001 From: Gabriel Scherer Date: Wed, 30 Jun 2021 22:05:03 +0200 Subject: [PATCH 05/10] import Zarith C code --- Zarith/caml_z.c | 3440 +++++++++++++++++++++++++++++++++++++++++++++++ Zarith/zarith.h | 46 + 2 files changed, 3486 insertions(+) create mode 100644 Zarith/caml_z.c create mode 100644 Zarith/zarith.h diff --git a/Zarith/caml_z.c b/Zarith/caml_z.c new file mode 100644 index 000000000000..63283e647b09 --- /dev/null +++ b/Zarith/caml_z.c @@ -0,0 +1,3440 @@ +/* This file is cloned from + https://github.com/ocaml/Zarith/blob/39df015/caml_z.c +*/ + +/** + Implementation of Z module. + + + This file is part of the Zarith library + http://forge.ocamlcore.org/projects/zarith . + It is distributed under LGPL 2 licensing, with static linking exception. + See the LICENSE file included in the distribution. + + Copyright (c) 2010-2011 Antoine Miné, Abstraction project. + Abstraction is part of the LIENS (Laboratoire d'Informatique de l'ENS), + a joint laboratory by: + CNRS (Centre national de la recherche scientifique, France), + ENS (École normale supérieure, Paris, France), + INRIA Rocquencourt (Institut national de recherche en informatique, France). + +*/ + + +/*--------------------------------------------------- + INCLUDES + ---------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include + +#ifdef HAS_GMP +#include +#endif +#ifdef HAS_MPIR +#include +#endif + +#include "zarith.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define inline __inline + +#ifdef _MSC_VER +#include +#include +#endif + +/* The "__has_builtin" special macro from Clang */ +#ifdef __has_builtin +#define HAS_BUILTIN(x) __has_builtin(x) +#else +#define HAS_BUILTIN(x) 0 +#endif + +/*--------------------------------------------------- + CONFIGURATION + ---------------------------------------------------*/ + +/* Whether to enable native (i.e. non-mpn_) operations and output + ocaml integers when possible. + Highly recommended. + */ +#define Z_FAST_PATH 1 +#define Z_USE_NATINT 1 + +/* Whether the fast path (arguments and result are small integers) + has already be handled in OCaml, so that there is no need to + re-test for it in C functions. + Applies to: neg, abs, add, sub, mul, div, rem, succ, pred, + logand, logor, logxor, lognot, shifts, divexact. +*/ +#define Z_FAST_PATH_IN_OCAML 1 + +/* Sanity checks. 
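+   When enabled, ml_z_alloc switches to an out-of-line variant that
+   plants a canary word after the last limb, and the Z_CHECK macro
+   expands to ml_z_check, which validates headers, canaries and
+   normalization at its call sites.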
*/ +#define Z_PERFORM_CHECK 0 + +/* Enable performance counters. + Prints some info on stdout at exit. +*/ +/* + #define Z_PERF_COUNTER 0 + now set by configure +*/ + +/* whether to use custom blocks (supporting serialization, comparison & + hashing) instead of abstract tags +*/ +#define Z_CUSTOM_BLOCK 1 + +/*--------------------------------------------------- + DATA STRUCTURES + ---------------------------------------------------*/ + +/* + we assume that: + - intnat is a signed integer type + - mp_limb_t is an unsigned integer type + - sizeof(intnat) == sizeof(mp_limb_t) == either 4 or 8 +*/ + +#ifdef _WIN64 +#define PRINTF_LIMB "I64" +#else +#define PRINTF_LIMB "l" +#endif + +/* + A z object x can be: + - either an ocaml int + - or a block with abstract or custom tag and containing: + . a 1 value header containing the sign Z_SIGN(x) and the size Z_SIZE(x) + . Z_SIZE(x) mp_limb_t + + Invariant: + - if the number fits in an int, it is stored in an int, not a block + - if the number is stored in a block, then Z_SIZE(x) >= 1 and + the most significant limb Z_LIMB(x)[Z_SIZE(x)] is not 0 + */ + + +/* a sign is always denoted as 0 (+) or Z_SIGN_MASK (-) */ +#ifdef ARCH_SIXTYFOUR +#define Z_SIGN_MASK 0x8000000000000000 +#define Z_SIZE_MASK 0x7fffffffffffffff +#else +#define Z_SIGN_MASK 0x80000000 +#define Z_SIZE_MASK 0x7fffffff +#endif + +#if Z_CUSTOM_BLOCK +#define Z_HEAD(x) (*((value*)Data_custom_val((x)))) +#define Z_LIMB(x) ((mp_limb_t*)Data_custom_val((x)) + 1) +#else +#define Z_HEAD(x) (Field((x),0)) +#define Z_LIMB(x) ((mp_limb_t*)&(Field((x),1))) +#endif +#define Z_SIGN(x) (Z_HEAD((x)) & Z_SIGN_MASK) +#define Z_SIZE(x) (Z_HEAD((x)) & Z_SIZE_MASK) + +/* bounds of an Ocaml int */ +#ifdef ARCH_SIXTYFOUR +#define Z_MAX_INT 0x3fffffffffffffff +#define Z_MIN_INT (-0x4000000000000000) +#else +#define Z_MAX_INT 0x3fffffff +#define Z_MIN_INT (-0x40000000) +#endif +#define Z_FITS_INT(v) ((v) >= Z_MIN_INT && (v) <= Z_MAX_INT) + +/* Z_MAX_INT may not be representable exactly as a double => we use a + lower approximation to be safe + */ +#ifdef ARCH_SIXTYFOUR +#define Z_MAX_INT_FL 0x3ffffffffffff000 +#define Z_MIN_INT_FL (-Z_MAX_INT_FL) +#else +#define Z_MAX_INT_FL Z_MAX_INT +#define Z_MIN_INT_FL Z_MIN_INT +#endif + +/* safe bounds to avoid overflow in multiplication */ +#ifdef ARCH_SIXTYFOUR +#define Z_MAX_HINT 0x3fffffff +#else +#define Z_MAX_HINT 0x3fff +#endif +#define Z_MIN_HINT (-Z_MAX_HINT) +#define Z_FITS_HINT(v) ((v) >= Z_MIN_HINT && (v) <= Z_MAX_HINT) + +/* hi bit of OCaml int32, int64 & nativeint */ +#define Z_HI_INT32 0x80000000 +#define Z_HI_INT64 0x8000000000000000LL +#ifdef ARCH_SIXTYFOUR +#define Z_HI_INTNAT Z_HI_INT64 +#define Z_HI_INT 0x4000000000000000 +#else +#define Z_HI_INTNAT Z_HI_INT32 +#define Z_HI_INT 0x40000000 +#endif + +/* safe bounds for the length of a base n string fitting in a native + int. Defined as the result of (n - 2) log_base(2) with n = 64 or + 32. 
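+   For example, on 64 bits in base 10: 62 * log10(2) = 18.6..., and
+   indeed any 18-digit decimal number is below 2^62, so it fits in an
+   OCaml int; hence Z_BASE10_LENGTH_OP = 18.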
+*/ +#ifdef ARCH_SIXTYFOUR +#define Z_BASE16_LENGTH_OP 15 +#define Z_BASE10_LENGTH_OP 18 +#define Z_BASE8_LENGTH_OP 20 +#define Z_BASE2_LENGTH_OP 62 +#else +#define Z_BASE16_LENGTH_OP 7 +#define Z_BASE10_LENGTH_OP 9 +#define Z_BASE8_LENGTH_OP 10 +#define Z_BASE2_LENGTH_OP 30 +#endif + +#define Z_LIMB_BITS (8 * sizeof(mp_limb_t)) + + +/* performance counters */ +unsigned long ml_z_ops = 0; +unsigned long ml_z_slow = 0; +unsigned long ml_z_ops_as = 0; + +#if Z_PERF_COUNTER +#define Z_MARK_OP ml_z_ops++ +#define Z_MARK_SLOW ml_z_slow++ +#else +#define Z_MARK_OP +#define Z_MARK_SLOW +#endif + +/*--------------------------------------------------- + UTILITIES + ---------------------------------------------------*/ + +extern struct custom_operations ml_z_custom_ops; + +static double ml_z_2p32; /* 2 ^ 32 in double */ + +#if Z_PERFORM_CHECK +/* for debugging: dump a mp_limb_t array */ +static void ml_z_dump(const char* msg, mp_limb_t* p, mp_size_t sz) +{ + mp_size_t i; + printf("%s %i: ",msg,(int)sz); + for (i = 0; i < sz; i++) +#ifdef ARCH_SIXTYFOUR + printf("%08" PRINTF_LIMB "x ",p[i]); +#else + printf("%04" PRINTF_LIMB "x ",p[i]); +#endif + printf("\n"); + fflush(stdout); +} +#endif + +#if Z_PERFORM_CHECK +/* for debugging: check invariant */ +void ml_z_check(const char* fn, int line, const char* arg, value v) +{ + mp_size_t sz; + + if (Is_long(v)) { +#if Z_USE_NATINT + return; +#else + printf("ml_z_check: unexpected tagged integer for %s at %s:%i.\n", arg, fn, line); + exit(1); +#endif + } +#if Z_CUSTOM_BLOCK + if (Custom_ops_val(v) != &ml_z_custom_ops) { + printf("ml_z_check: wrong custom block for %s at %s:%i.\n", + arg, fn, line); + exit(1); + } + sz = Wosize_val(v) - 1; +#else + sz = Wosize_val(v); +#endif + if (Z_SIZE(v) + 2 > sz) { + printf("ml_z_check: invalid block size (%i / %i) for %s at %s:%i.\n", + (int)Z_SIZE(v), (int)sz, + arg, fn, line); + exit(1); + } + if ((mp_size_t) Z_LIMB(v)[sz - 2] != (mp_size_t)(0xDEADBEEF ^ (sz - 2))) { + printf("ml_z_check: corrupted block for %s at %s:%i.\n", + arg, fn, line); + exit(1); + } + if (Z_SIZE(v) && !Z_LIMB(v)[Z_SIZE(v)-1]) { + printf("ml_z_check: unreduced argument for %s at %s:%i.\n", arg, fn, line); + ml_z_dump("offending argument: ", Z_LIMB(v), Z_SIZE(v)); + exit(1); + } +#if Z_USE_NATINT + if (Z_SIZE(v) == 0 + || (Z_SIZE(v) <= 1 + && (Z_LIMB(v)[0] <= Z_MAX_INT + || (Z_LIMB(v)[0] == -Z_MIN_INT && Z_SIGN(v))))) { + printf("ml_z_check: expected a tagged integer for %s at %s:%i.\n", arg, fn, line); + ml_z_dump("offending argument: ", Z_LIMB(v), Z_SIZE(v)); + exit(1); + } +#else + if (!Z_SIZE(v) && Z_SIGN(v)) { + printf("ml_z_check: invalid sign of 0 for %s at %s:%i.\n", + arg, fn, line); + exit(1); + } +#endif +} +#endif + +/* for debugging */ +#if Z_PERFORM_CHECK +#define Z_CHECK(v) ml_z_check(__FUNCTION__, __LINE__, #v, v) +#else +#define Z_CHECK(v) +#endif + +/* allocates z object block with space for sz mp_limb_t; + does not set the header + */ + +#if !Z_PERFORM_CHECK +/* inlined allocation */ +#if Z_CUSTOM_BLOCK +#define ml_z_alloc(sz) \ + caml_alloc_custom(&ml_z_custom_ops, (1 + (sz)) * sizeof(value), 0, 1) +#else +#define ml_z_alloc(sz) \ + caml_alloc(1 + (sz), Abstract_tag); +#endif + +#else +/* out-of-line allocation, inserting a canary after the last limb */ +static value ml_z_alloc(mp_size_t sz) +{ + value v; +#if Z_CUSTOM_BLOCK + v = caml_alloc_custom(&ml_z_custom_ops, (1 + sz + 1) * sizeof(value), 0, 1); +#else + v = caml_alloc(1 + sz + 1, Abstract_tag); +#endif + Z_LIMB(v)[sz] = 0xDEADBEEF ^ sz; + return v; +} +#endif + +/* 
duplicates the caml block src */ +static inline void ml_z_cpy_limb(mp_limb_t* dst, mp_limb_t* src, mp_size_t sz) +{ + memcpy(dst, src, sz * sizeof(mp_limb_t)); +} + +/* duplicates the mp_limb_t array src */ +static inline mp_limb_t* ml_z_dup_limb(mp_limb_t* src, mp_size_t sz) +{ + mp_limb_t* r = (mp_limb_t*) malloc(sz * sizeof(mp_limb_t)); + memcpy(r, src, sz * sizeof(mp_limb_t)); + return r; +} + + +#ifdef _MSC_VER +#define MAYBE_UNUSED +#else +#define MAYBE_UNUSED (void) +#endif + +/* given a z object, define: + - ptr_arg: a pointer to the first mp_limb_t + - size_arg: the number of mp-limb_t + - sign_arg: the sign of the number + if arg is an int, it is converted to a 1-limb number +*/ +#define Z_DECL(arg) \ + mp_limb_t loc_##arg, *ptr_##arg; \ + mp_size_t size_##arg; \ + intnat sign_##arg; \ + MAYBE_UNUSED loc_##arg; \ + MAYBE_UNUSED ptr_##arg; \ + MAYBE_UNUSED size_##arg; \ + MAYBE_UNUSED sign_##arg; + +#define Z_ARG(arg) \ + if (Is_long(arg)) { \ + intnat n = Long_val(arg); \ + loc_##arg = n < 0 ? -n : n; \ + sign_##arg = n & Z_SIGN_MASK; \ + size_##arg = n != 0; \ + ptr_##arg = &loc_##arg; \ + } \ + else { \ + size_##arg = Z_SIZE(arg); \ + sign_##arg = Z_SIGN(arg); \ + ptr_##arg = Z_LIMB(arg); \ + } + +/* After an allocation, a heap-allocated Z argument may have moved and + its ptr_arg pointer can be invalid. Reset the ptr_arg pointer to + its correct value. */ + +#define Z_REFRESH(arg) \ + if (! Is_long(arg)) ptr_##arg = Z_LIMB(arg); + +/* computes the actual size of the z object r and updates its header, + either returns r or, if the number is small enough, an int + */ +static value ml_z_reduce(value r, mp_size_t sz, intnat sign) +{ + while (sz > 0 && !Z_LIMB(r)[sz-1]) sz--; +#if Z_USE_NATINT + if (!sz) return Val_long(0); + if (sz <= 1) { + if (Z_LIMB(r)[0] <= Z_MAX_INT) { + if (sign) return Val_long(-Z_LIMB(r)[0]); + else return Val_long(Z_LIMB(r)[0]); + } + if (Z_LIMB(r)[0] == -Z_MIN_INT && sign) { + return Val_long(Z_MIN_INT); + } + } +#else + if (!sz) sign = 0; +#endif + Z_HEAD(r) = sz | sign; + return r; +} + +static void ml_z_raise_overflow() +{ + caml_raise_constant(*caml_named_value("ml_z_overflow")); +} + +#define ml_z_raise_divide_by_zero() \ + caml_raise_zero_divide() + + + +/*--------------------------------------------------- + CONVERSION FUNCTIONS + ---------------------------------------------------*/ + +CAMLprim value ml_z_of_int(value v) +{ +#if Z_USE_NATINT + Z_MARK_OP; + return v; +#else + intnat x; + value r; + Z_MARK_OP; + Z_MARK_SLOW; + x = Long_val(v); + r = ml_z_alloc(1); + if (x > 0) { Z_HEAD(r) = 1; Z_LIMB(r)[0] = x; } + else if (x < 0) { Z_HEAD(r) = 1 | Z_SIGN_MASK; Z_LIMB(r)[0] = -x; } + else Z_HEAD(r) = 0; + Z_CHECK(r); + return r; +#endif +} + +CAMLprim value ml_z_of_nativeint(value v) +{ + intnat x; + value r; + Z_MARK_OP; + x = Nativeint_val(v); +#if Z_USE_NATINT + if (Z_FITS_INT(x)) return Val_long(x); +#endif + Z_MARK_SLOW; + r = ml_z_alloc(1); + if (x > 0) { Z_HEAD(r) = 1; Z_LIMB(r)[0] = x; } + else if (x < 0) { Z_HEAD(r) = 1 | Z_SIGN_MASK; Z_LIMB(r)[0] = -x; } + else Z_HEAD(r) = 0; + Z_CHECK(r); + return r; +} + +CAMLprim value ml_z_of_int32(value v) +{ + int32_t x; + Z_MARK_OP; + x = Int32_val(v); +#if Z_USE_NATINT && defined(ARCH_SIXTYFOUR) + return Val_long(x); +#else +#if Z_USE_NATINT + if (Z_FITS_INT(x)) return Val_long(x); +#endif + { + value r; + Z_MARK_SLOW; + r = ml_z_alloc(1); + if (x > 0) { Z_HEAD(r) = 1; Z_LIMB(r)[0] = x; } + else if (x < 0) { Z_HEAD(r) = 1 | Z_SIGN_MASK; Z_LIMB(r)[0] = -(mp_limb_t)x; } + else Z_HEAD(r) = 0; + 
Z_CHECK(r); + return r; + } +#endif +} + +CAMLprim value ml_z_of_int64(value v) +{ + int64_t x; + value r; + Z_MARK_OP; + x = Int64_val(v); +#if Z_USE_NATINT + if (Z_FITS_INT(x)) return Val_long(x); +#endif + Z_MARK_SLOW; +#ifdef ARCH_SIXTYFOUR + r = ml_z_alloc(1); + if (x > 0) { Z_HEAD(r) = 1; Z_LIMB(r)[0] = x; } + else if (x < 0) { Z_HEAD(r) = 1 | Z_SIGN_MASK; Z_LIMB(r)[0] = -x; } + else Z_HEAD(r) = 0; +#else + { + mp_limb_t sign; + r = ml_z_alloc(2); + if (x >= 0) { sign = 0; } + else { sign = Z_SIGN_MASK; x = -x; } + Z_LIMB(r)[0] = x; + Z_LIMB(r)[1] = x >> 32; + r = ml_z_reduce(r, 2, sign); + } +#endif + Z_CHECK(r); + return r; +} + +CAMLprim value ml_z_of_float(value v) +{ + double x; + int exp; + int64_t y, m; + value r; + Z_MARK_OP; + x = Double_val(v); +#if Z_USE_NATINT + if (x >= Z_MIN_INT_FL && x <= Z_MAX_INT_FL) return Val_long((intnat) x); +#endif + Z_MARK_SLOW; +#ifdef ARCH_ALIGN_INT64 + memcpy(&y, (void *) v, 8); +#else + y = *((int64_t*)v); +#endif + exp = ((y >> 52) & 0x7ff) - 1023; /* exponent */ + if (exp < 0) return(Val_long(0)); + if (exp == 1024) ml_z_raise_overflow(); /* NaN or infinity */ + m = (y & 0x000fffffffffffffLL) | 0x0010000000000000LL; /* mantissa */ + if (exp <= 52) { + m >>= 52-exp; +#ifdef ARCH_SIXTYFOUR + r = Val_long((x >= 0.) ? m : -m); +#else + r = ml_z_alloc(2); + Z_LIMB(r)[0] = m; + Z_LIMB(r)[1] = m >> 32; + r = ml_z_reduce(r, 2, (x >= 0.) ? 0 : Z_SIGN_MASK); +#endif + } + else { + int c1 = (exp-52) / Z_LIMB_BITS; + int c2 = (exp-52) % Z_LIMB_BITS; + mp_size_t i; +#ifdef ARCH_SIXTYFOUR + r = ml_z_alloc(c1 + 2); + for (i = 0; i < c1; i++) Z_LIMB(r)[i] = 0; + Z_LIMB(r)[c1] = m << c2; + Z_LIMB(r)[c1+1] = c2 ? (m >> (64-c2)) : 0; + r = ml_z_reduce(r, c1 + 2, (x >= 0.) ? 0 : Z_SIGN_MASK); +#else + r = ml_z_alloc(c1 + 3); + for (i = 0; i < c1; i++) Z_LIMB(r)[i] = 0; + Z_LIMB(r)[c1] = m << c2; + Z_LIMB(r)[c1+1] = m >> (32-c2); + Z_LIMB(r)[c1+2] = c2 ? (m >> (64-c2)) : 0; + r = ml_z_reduce(r, c1 + 3, (x >= 0.) ? 0 : Z_SIGN_MASK); +#endif + } + Z_CHECK(r); + return r; +} + +CAMLprim value ml_z_of_substring_base(value b, value v, value offset, value length) +{ + CAMLparam1(v); + CAMLlocal1(r); + intnat ofs = Long_val(offset); + intnat len = Long_val(length); + /* make sure the ofs/length make sense */ + if (ofs < 0 + || len < 0 + || (intnat)caml_string_length(v) < ofs + len) + caml_invalid_argument("Z.of_substring_base: invalid offset or length"); + /* process the string */ + const char *d = String_val(v) + ofs; + const char *end = d + len; + mp_size_t i, j, sz, sz2, num_digits = 0; + mp_limb_t sign = 0; + intnat base = Long_val(b); + /* We allow [d] to advance beyond [end] while parsing the prefix: + sign, base, and/or leading zeros. + This simplifies the code, and reading these locations is safe since + we don't progress beyond a terminating null character. + At the end of the prefix, if we ran past the end, we return 0. + */ + /* get optional sign */ + if (*d == '-') { sign ^= Z_SIGN_MASK; d++; } + if (*d == '+') d++; + /* get optional base */ + if (!base) { + base = 10; + if (*d == '0') { + d++; + if (*d == 'o' || *d == 'O') { base = 8; d++; } + else if (*d == 'x' || *d == 'X') { base = 16; d++; } + else if (*d == 'b' || *d == 'B') { base = 2; d++; } + else { + /* The leading zero is not part of a base prefix. 
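+         (For instance, "0555" is parsed as the decimal number 555:
+         the '0' is consumed as an ordinary leading zero.)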
This is an + important distinction for the check below looking at + leading underscore + */ + d--; } + } + } + if (base < 2 || base > 16) + caml_invalid_argument("Z.of_substring_base: base must be between 2 and 16"); + /* we do not allow leading underscore */ + if (*d == '_') + caml_invalid_argument("Z.of_substring_base: invalid digit"); + while (*d == '0' || *d == '_') d++; + /* sz is the length of the substring that has not been consumed above. */ + sz = end - d; + for(i = 0; i < sz; i++){ + /* underscores are going to be ignored below. Assuming the string + is well formatted, this will give us the exact number of digits */ + if(d[i] != '_') num_digits++; + } +#if Z_USE_NATINT + if (sz <= 0) { + /* "+", "-", "0x" are parsed as 0. */ + r = Val_long(0); + } + /* Process common case (fits into a native integer) */ + else if ((base == 10 && num_digits <= Z_BASE10_LENGTH_OP) + || (base == 16 && num_digits <= Z_BASE16_LENGTH_OP) + || (base == 8 && num_digits <= Z_BASE8_LENGTH_OP) + || (base == 2 && num_digits <= Z_BASE2_LENGTH_OP)) { + Z_MARK_OP; + intnat ret = 0; + for (i = 0; i < sz; i++) { + int digit = 0; + if (d[i] == '_') continue; + if (d[i] >= '0' && d[i] <= '9') digit = d[i] - '0'; + else if (d[i] >= 'a' && d[i] <= 'f') digit = d[i] - 'a' + 10; + else if (d[i] >= 'A' && d[i] <= 'F') digit = d[i] - 'A' + 10; + else caml_invalid_argument("Z.of_substring_base: invalid digit"); + if (digit >= base) + caml_invalid_argument("Z.of_substring_base: invalid digit"); + ret = ret * base + digit; + } + r = Val_long(ret * (sign ? -1 : 1)); + } else +#endif + { + /* converts to sequence of digits */ + char* digits = (char*)malloc(num_digits+1); + for (i = 0, j = 0; i < sz; i++) { + if (d[i] == '_') continue; + if (d[i] >= '0' && d[i] <= '9') digits[j] = d[i] - '0'; + else if (d[i] >= 'a' && d[i] <= 'f') digits[j] = d[i] - 'a' + 10; + else if (d[i] >= 'A' && d[i] <= 'F') digits[j] = d[i] - 'A' + 10; + else { + free(digits); + caml_invalid_argument("Z.of_substring_base: invalid digit"); + } + if (digits[j] >= base) { + free(digits); + caml_invalid_argument("Z.of_substring_base: invalid digit"); + } + j++; + } + /* make sure that digits is nul terminated */ + digits[j] = 0; + r = ml_z_alloc(1 + j / (2 * sizeof(mp_limb_t))); + sz2 = mpn_set_str(Z_LIMB(r), (unsigned char*)digits, j, base); + r = ml_z_reduce(r, sz2, sign); + free(digits); + } + Z_CHECK(r); + CAMLreturn(r); +} + +CAMLprim value ml_z_to_int(value v) +{ + intnat x; + Z_DECL(v); + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) return v; + Z_MARK_SLOW; + Z_ARG(v); + if (size_v > 1) ml_z_raise_overflow(); + if (!size_v) return Val_long(0); + x = *ptr_v; + if (sign_v) { + if ((uintnat)x > Z_HI_INT) ml_z_raise_overflow(); + x = -x; + } + else { + if ((uintnat)x >= Z_HI_INT) ml_z_raise_overflow(); + } + return Val_long(x); +} + +CAMLprim value ml_z_to_nativeint(value v) +{ + intnat x; + Z_DECL(v); + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) return caml_copy_nativeint(Long_val(v)); + Z_MARK_SLOW; + Z_ARG(v); + if (size_v > 1) ml_z_raise_overflow(); + if (!size_v) x = 0; + else { + x = *ptr_v; + if (sign_v) { + if ((uintnat)x > Z_HI_INTNAT) ml_z_raise_overflow(); + x = -x; + } + else { + if ((uintnat)x >= Z_HI_INTNAT) ml_z_raise_overflow(); + } + } + return caml_copy_nativeint(x); +} + +CAMLprim value ml_z_to_int32(value v) +{ + intnat x; + Z_DECL(v); + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) { + x = Long_val(v); +#ifdef ARCH_SIXTYFOUR + if (x >= (intnat)Z_HI_INT32 || x < -(intnat)Z_HI_INT32) + ml_z_raise_overflow(); +#endif + return 
caml_copy_int32(x); + } + else { + Z_ARG(v); + Z_MARK_SLOW; + if (size_v > 1) ml_z_raise_overflow(); + if (!size_v) x = 0; + else { + x = *ptr_v; + if (sign_v) { + if ((uintnat)x > Z_HI_INT32) ml_z_raise_overflow(); + x = -x; + } + else { + if ((uintnat)x >= Z_HI_INT32) ml_z_raise_overflow(); + } + } + return caml_copy_int32(x); + } +} + +CAMLprim value ml_z_to_int64(value v) +{ + int64_t x = 0; + Z_DECL(v); + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) return caml_copy_int64(Long_val(v)); + Z_MARK_SLOW; + Z_ARG(v); + switch (size_v) { + case 0: x = 0; break; + case 1: x = ptr_v[0]; break; +#ifndef ARCH_SIXTYFOUR + case 2: x = ptr_v[0] | ((uint64_t)ptr_v[1] << 32); break; +#endif + default: ml_z_raise_overflow(); break; + } + if (sign_v) { + if ((uint64_t)x > Z_HI_INT64) ml_z_raise_overflow(); + x = -x; + } + else { + if ((uint64_t)x >= Z_HI_INT64) ml_z_raise_overflow(); + } + return caml_copy_int64(x); +} + +/* XXX: characters that do not belong to the format are ignored, this departs + from the classic printf behavior (it copies them in the output) + */ +CAMLprim value ml_z_format(value f, value v) +{ + CAMLparam2(f,v); + Z_DECL(v); + const char tab[2][16] = + { { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }, + { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' } }; + char* buf, *dst; + mp_size_t i, size_dst, max_size; + value r; + const char* fmt = String_val(f); + int base = 10; /* base */ + int cas = 0; /* uppercase X / lowercase x */ + int width = 0; + int alt = 0; /* alternate # */ + int dir = 0; /* right / left adjusted */ + char sign = 0; /* sign char */ + char pad = ' '; /* padding char */ + char *prefix = ""; + Z_MARK_OP; + Z_CHECK(v); + Z_ARG(v); + Z_MARK_SLOW; + + /* parse format */ + while (*fmt == '%') fmt++; + for (; ; fmt++) { + if (*fmt == '#') alt = 1; + else if (*fmt == '0') pad = '0'; + else if (*fmt == '-') dir = 1; + else if (*fmt == ' ' || *fmt == '+') sign = *fmt; + else break; + } + if (sign_v) sign = '-'; + for (;*fmt>='0' && *fmt<='9';fmt++) + width = 10*width + *fmt-'0'; + switch (*fmt) { + case 'i': case 'd': case 'u': break; + case 'b': base = 2; if (alt) prefix = "0b"; break; + case 'o': base = 8; if (alt) prefix = "0o"; break; + case 'x': base = 16; if (alt) prefix = "0x"; cas = 1; break; + case 'X': base = 16; if (alt) prefix = "0X"; break; + default: caml_invalid_argument("Z.format: invalid format"); + } + if (dir) pad = ' '; + /* get digits */ + /* we need space for sign + prefix + digits + 1 + padding + terminal 0 */ + max_size = 1 + 2 + Z_LIMB_BITS * size_v + 1 + 2 * width + 1; + buf = (char*) malloc(max_size); + dst = buf + 1 + 2 + width; + if (!size_v) { + size_dst = 1; + *dst = '0'; + } + else { + mp_limb_t* copy_v = ml_z_dup_limb(ptr_v, size_v); + size_dst = mpn_get_str((unsigned char*)dst, base, copy_v, size_v); + if (dst + size_dst >= buf + max_size) + caml_failwith("Z.format: internal error"); + free(copy_v); + while (size_dst && !*dst) { dst++; size_dst--; } + for (i = 0; i < size_dst; i++) + dst[i] = tab[cas][ (int) dst[i] ]; + } + /* add prefix, sign & padding */ + if (pad == ' ') { + if (dir) { + /* left alignment */ + for (i = strlen(prefix); i > 0; i--, size_dst++) + *(--dst) = prefix[i-1]; + if (sign) { *(--dst) = sign; size_dst++; } + for (; size_dst < width; size_dst++) + dst[size_dst] = pad; + } + else { + /* right alignment, space padding */ + for (i = strlen(prefix); i > 0; i--, size_dst++) + *(--dst) = prefix[i-1]; + if (sign) { *(--dst) = sign; size_dst++; } + for (; 
size_dst < width; size_dst++) *(--dst) = pad; + } + } + else { + /* right alignment, non-space padding */ + width -= strlen(prefix) + (sign ? 1 : 0); + for (; size_dst < width; size_dst++) *(--dst) = pad; + for (i = strlen(prefix); i > 0; i--, size_dst++) + *(--dst) = prefix[i-1]; + if (sign) { *(--dst) = sign; size_dst++; } + } + dst[size_dst] = 0; + if (dst < buf || dst + size_dst >= buf + max_size) + caml_failwith("Z.format: internal error"); + r = caml_copy_string(dst); + free(buf); + CAMLreturn(r); +} + +#ifdef ARCH_SIXTYFOUR +#define BITS_PER_WORD 64 +#else +#define BITS_PER_WORD 32 +#endif + +CAMLprim value ml_z_extract(value arg, value off, value len) +{ + intnat o, l, x; + mp_size_t sz, c1, c2, csz, i; + mp_limb_t cr; + value r; + Z_DECL(arg); + Z_MARK_OP; + MAYBE_UNUSED x; + o = Long_val(off); + l = Long_val(len); + if (o < 0) caml_invalid_argument("Z.extract: negative bit offset"); + if (l <= 0) caml_invalid_argument("Z.extract: nonpositive bit length"); +#if Z_USE_NATINT + /* Fast path */ + if (Is_long(arg)) { + x = Long_val(arg); + /* Shift away low "o" bits. If "o" too big, just replicate sign bit. */ + if (o >= BITS_PER_WORD) o = BITS_PER_WORD - 1; + x = x >> o; + /* Extract "l" low bits, if "l" is small enough */ + if (l < BITS_PER_WORD - 1) { + x = x & (((intnat)1 << l) - 1); + return Val_long(x); + } else { + /* If x >= 0, the extraction of "l" low bits keeps x unchanged. */ + if (x >= 0) return Val_long(x); + /* If x < 0, fall through slow path */ + } + } +#endif + Z_MARK_SLOW; + { + CAMLparam1(arg); + Z_ARG(arg); + sz = (l + Z_LIMB_BITS - 1) / Z_LIMB_BITS; + r = ml_z_alloc(sz + 1); + Z_REFRESH(arg); + c1 = o / Z_LIMB_BITS; + c2 = o % Z_LIMB_BITS; + /* shift or copy */ + csz = size_arg - c1; + if (csz > sz + 1) csz = sz + 1; + cr = 0; + if (csz > 0) { + if (c2) cr = mpn_rshift(Z_LIMB(r), ptr_arg + c1, csz, c2); + else ml_z_cpy_limb(Z_LIMB(r), ptr_arg + c1, csz); + } + else csz = 0; + /* 0-pad */ + for (i = csz; i < sz; i++) + Z_LIMB(r)[i] = 0; + /* 2's complement */ + if (sign_arg) { + for (i = 0; i < sz; i++) + Z_LIMB(r)[i] = ~Z_LIMB(r)[i]; + /* carry (cr=0 if all shifted-out bits are 0) */ + for (i = 0; !cr && i < c1 && i < size_arg; i++) + cr = ptr_arg[i]; + if (!cr) mpn_add_1(Z_LIMB(r), Z_LIMB(r), sz, 1); + } + /* mask out high bits */ + l %= Z_LIMB_BITS; + if (l) Z_LIMB(r)[sz-1] &= ((uintnat)(intnat)-1) >> (Z_LIMB_BITS - l); + r = ml_z_reduce(r, sz, 0); + CAMLreturn(r); + } +} + +/* NOTE: the sign is not stored */ +CAMLprim value ml_z_to_bits(value arg) +{ + CAMLparam1(arg); + CAMLlocal1(r); + Z_DECL(arg); + mp_size_t i; + unsigned char* p; + Z_MARK_OP; + Z_MARK_SLOW; + Z_ARG(arg); + r = caml_alloc_string(size_arg * sizeof(mp_limb_t)); + Z_REFRESH(arg); + p = (unsigned char*) String_val(r); + memset(p, 0, size_arg * sizeof(mp_limb_t)); + for (i = 0; i < size_arg; i++) { + mp_limb_t x = ptr_arg[i]; + *(p++) = x; + *(p++) = x >> 8; + *(p++) = x >> 16; + *(p++) = x >> 24; +#ifdef ARCH_SIXTYFOUR + *(p++) = x >> 32; + *(p++) = x >> 40; + *(p++) = x >> 48; + *(p++) = x >> 56; +#endif + } + CAMLreturn(r); +} + +CAMLprim value ml_z_of_bits(value arg) +{ + CAMLparam1(arg); + CAMLlocal1(r); + mp_size_t sz, szw; + mp_size_t i = 0; + mp_limb_t x; + const unsigned char* p; + Z_MARK_OP; + Z_MARK_SLOW; + sz = caml_string_length(arg); + szw = (sz + sizeof(mp_limb_t) - 1) / sizeof(mp_limb_t); + r = ml_z_alloc(szw); + p = (const unsigned char*) String_val(arg); + /* all limbs but last */ + if (szw > 1) { + for (; i < szw - 1; i++) { + x = *(p++); + x |= ((mp_limb_t) *(p++)) << 8; + x 
|= ((mp_limb_t) *(p++)) << 16; + x |= ((mp_limb_t) *(p++)) << 24; +#ifdef ARCH_SIXTYFOUR + x |= ((mp_limb_t) *(p++)) << 32; + x |= ((mp_limb_t) *(p++)) << 40; + x |= ((mp_limb_t) *(p++)) << 48; + x |= ((mp_limb_t) *(p++)) << 56; +#endif + Z_LIMB(r)[i] = x; + } + sz -= i * sizeof(mp_limb_t); + } + /* last limb */ + if (sz > 0) { + x = *(p++); + if (sz > 1) x |= ((mp_limb_t) *(p++)) << 8; + if (sz > 2) x |= ((mp_limb_t) *(p++)) << 16; + if (sz > 3) x |= ((mp_limb_t) *(p++)) << 24; +#ifdef ARCH_SIXTYFOUR + if (sz > 4) x |= ((mp_limb_t) *(p++)) << 32; + if (sz > 5) x |= ((mp_limb_t) *(p++)) << 40; + if (sz > 6) x |= ((mp_limb_t) *(p++)) << 48; + if (sz > 7) x |= ((mp_limb_t) *(p++)) << 56; +#endif + Z_LIMB(r)[i] = x; + } + r = ml_z_reduce(r, szw, 0); + Z_CHECK(r); + CAMLreturn(r); +} + +/*--------------------------------------------------- + TESTS AND COMPARISONS + ---------------------------------------------------*/ + +CAMLprim value ml_z_compare(value arg1, value arg2) +{ + int r; + Z_DECL(arg1); Z_DECL(arg2); + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH + /* Value-equal small integers are equal. + Pointer-equal big integers are equal as well. */ + if (arg1 == arg2) return Val_long(0); + if (Is_long(arg2)) { + if (Is_long(arg1)) { + return arg1 > arg2 ? Val_long(1) : Val_long(-1); + } else { + /* Either arg1 is positive and arg1 > Z_MAX_INT >= arg2 -> result +1 + or arg1 is negative and arg1 < Z_MIN_INT <= arg2 -> result -1 */ + return Z_SIGN(arg1) ? Val_long(-1) : Val_long(1); + } + } + else if (Is_long(arg1)) { + /* Either arg2 is positive and arg2 > Z_MAX_INT >= arg1 -> result -1 + or arg2 is negative and arg2 < Z_MIN_INT <= arg1 -> result +1 */ + return Z_SIGN(arg2) ? Val_long(1) : Val_long(-1); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + Z_ARG(arg1); + Z_ARG(arg2); + r = 0; + if (sign_arg1 != sign_arg2) r = 1; + else if (size_arg1 > size_arg2) r = 1; + else if (size_arg1 < size_arg2) r = -1; + else { + mp_size_t i; + for (i = size_arg1 - 1; i >= 0; i--) { + if (ptr_arg1[i] > ptr_arg2[i]) { r = 1; break; } + if (ptr_arg1[i] < ptr_arg2[i]) { r = -1; break; } + } + } + if (sign_arg1) r = -r; + return Val_long(r); +} + +CAMLprim value ml_z_equal(value arg1, value arg2) +{ + mp_size_t i; + Z_DECL(arg1); Z_DECL(arg2); + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH + /* Value-equal small integers are equal. + Pointer-equal big integers are equal as well. */ + if (arg1 == arg2) return Val_true; + /* If both arg1 and arg2 are small integers but failed the equality + test above, they are different. + If one of arg1/arg2 is a small integer and the other is a big integer, + they are different: one is in the range [Z_MIN_INT,Z_MAX_INT] + and the other is outside this range. 
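+     This relies on the normalization invariant stated at the top of
+     this file: a number that fits in [Z_MIN_INT,Z_MAX_INT] is always
+     stored as a tagged integer, never as a block.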
*/ + if (Is_long(arg2) || Is_long(arg1)) return Val_false; +#endif + /* mpn_ version */ + Z_MARK_SLOW; + Z_ARG(arg1); + Z_ARG(arg2); + if (sign_arg1 != sign_arg2 || size_arg1 != size_arg2) return Val_false; + for (i = 0; i < size_arg1; i++) + if (ptr_arg1[i] != ptr_arg2[i]) return Val_false; + return Val_true; +} + +int ml_z_sgn(value arg) +{ + if (Is_long(arg)) { + if (arg > Val_long(0)) return 1; + else if (arg < Val_long(0)) return -1; + else return 0; + } + else { + Z_MARK_SLOW; +#if !Z_USE_NATINT + /* In "use natint" mode, zero is a small integer, treated above */ + if (!Z_SIZE(arg)) return 0; +#endif + if (Z_SIGN(arg)) return -1; else return 1; + } +} +CAMLprim value ml_z_sign(value arg) +{ + Z_MARK_OP; + Z_CHECK(arg); + return Val_long(ml_z_sgn(arg)); +} + +CAMLprim value ml_z_fits_int(value v) +{ + intnat x; + Z_DECL(v); + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) return Val_true; + Z_MARK_SLOW; + Z_ARG(v); + if (size_v > 1) return Val_false; + if (!size_v) return Val_true; + x = *ptr_v; + if (sign_v) { + if ((uintnat)x > Z_HI_INT) return Val_false; + } + else { + if ((uintnat)x >= Z_HI_INT) return Val_false; + } + return Val_true; +} + +CAMLprim value ml_z_fits_nativeint(value v) +{ + intnat x; + Z_DECL(v); + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) return Val_true; + Z_MARK_SLOW; + Z_ARG(v); + if (size_v > 1) return Val_false; + if (!size_v) return Val_true; + x = *ptr_v; + if (sign_v) { + if ((uintnat)x > Z_HI_INTNAT) return Val_false; + } + else { + if ((uintnat)x >= Z_HI_INTNAT) return Val_false; + } + return Val_true; +} + +CAMLprim value ml_z_fits_int32(value v) +{ + intnat x; + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) { +#ifdef ARCH_SIXTYFOUR + x = Long_val(v); + if (x >= (intnat)Z_HI_INT32 || x < -(intnat)Z_HI_INT32) + return Val_false; +#endif + return Val_true; + } + else { + Z_DECL(v); + Z_MARK_SLOW; + Z_ARG(v); + if (size_v > 1) return Val_false; + if (!size_v) return Val_true; + x = *ptr_v; + if (sign_v) { + if ((uintnat)x > Z_HI_INT32) return Val_false; + } + else { + if ((uintnat)x >= Z_HI_INT32) return Val_false; + } + return Val_true; + } +} + +CAMLprim value ml_z_fits_int64(value v) +{ + int64_t x; + Z_DECL(v); + Z_MARK_OP; + Z_CHECK(v); + if (Is_long(v)) return Val_true; + Z_MARK_SLOW; + Z_ARG(v); + switch (size_v) { + case 0: return Val_true; + case 1: x = ptr_v[0]; break; +#ifndef ARCH_SIXTYFOUR + case 2: x = ptr_v[0] | ((uint64_t)ptr_v[1] << 32); break; +#endif + default: return Val_false; + } + if (sign_v) { + if ((uint64_t)x > Z_HI_INT64) return Val_false; + } + else { + if ((uint64_t)x >= Z_HI_INT64) return Val_false; + } + return Val_true; +} + +CAMLprim value ml_z_size(value v) +{ + Z_MARK_OP; + if (Is_long(v)) return Val_long(1); + else return Val_long(Z_SIZE(v)); +} + + +/*--------------------------------------------------- + ARITHMETIC OPERATORS + ---------------------------------------------------*/ + +CAMLprim value ml_z_neg(value arg) +{ + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg)) { + /* fast path */ + if (arg > Val_long(Z_MIN_INT)) return 2 - arg; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam1(arg); + value r; + Z_DECL(arg); + Z_ARG(arg); + r = ml_z_alloc(size_arg); + Z_REFRESH(arg); + ml_z_cpy_limb(Z_LIMB(r), ptr_arg, size_arg); + r = ml_z_reduce(r, size_arg, sign_arg ^ Z_SIGN_MASK); + Z_CHECK(r); + CAMLreturn(r); + } +} + +CAMLprim value ml_z_abs(value arg) +{ + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg)) { + /* fast path */ + if 
(arg >= Val_long(0)) return arg; + if (arg > Val_long(Z_MIN_INT)) return 2 - arg; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam1(arg); + Z_DECL(arg); + value r; + Z_ARG(arg); + if (sign_arg) { + r = ml_z_alloc(size_arg); + Z_REFRESH(arg); + ml_z_cpy_limb(Z_LIMB(r), ptr_arg, size_arg); + r = ml_z_reduce(r, size_arg, 0); + Z_CHECK(r); + } + else r = arg; + CAMLreturn(r); + } +} + +/* helper function for add/sub */ +static value ml_z_addsub(value arg1, value arg2, intnat sign) +{ + CAMLparam2(arg1,arg2); + Z_DECL(arg1); Z_DECL(arg2); + value r; + mp_limb_t c; + Z_ARG(arg1); + Z_ARG(arg2); + sign_arg2 ^= sign; + if (!size_arg2) r = arg1; + else if (!size_arg1) { + if (sign) { + /* negation */ + r = ml_z_alloc(size_arg2); + Z_REFRESH(arg2); + ml_z_cpy_limb(Z_LIMB(r), ptr_arg2, size_arg2); + r = ml_z_reduce(r, size_arg2, sign_arg2); + } + else r = arg2; + } + else if (sign_arg1 == sign_arg2) { + /* addition */ + if (size_arg1 >= size_arg2) { + r = ml_z_alloc(size_arg1 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + c = mpn_add(Z_LIMB(r), ptr_arg1, size_arg1, ptr_arg2, size_arg2); + Z_LIMB(r)[size_arg1] = c; + r = ml_z_reduce(r, size_arg1+1, sign_arg1); + } + else { + r = ml_z_alloc(size_arg2 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + c = mpn_add(Z_LIMB(r), ptr_arg2, size_arg2, ptr_arg1, size_arg1); + Z_LIMB(r)[size_arg2] = c; + r = ml_z_reduce(r, size_arg2+1, sign_arg1); + } + } + else { + /* subtraction */ + if (size_arg1 > size_arg2) { + r = ml_z_alloc(size_arg1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub(Z_LIMB(r), ptr_arg1, size_arg1, ptr_arg2, size_arg2); + r = ml_z_reduce(r, size_arg1, sign_arg1); + } + else if (size_arg1 < size_arg2) { + r = ml_z_alloc(size_arg2); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub(Z_LIMB(r), ptr_arg2, size_arg2, ptr_arg1, size_arg1); + r = ml_z_reduce(r, size_arg2, sign_arg2); + } + else { + int cmp = mpn_cmp(ptr_arg1, ptr_arg2, size_arg1); + if (cmp > 0) { + r = ml_z_alloc(size_arg1+1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_n(Z_LIMB(r), ptr_arg1, ptr_arg2, size_arg1); + r = ml_z_reduce(r, size_arg1, sign_arg1); + } + else if (cmp < 0) { + r = ml_z_alloc(size_arg1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_n(Z_LIMB(r), ptr_arg2, ptr_arg1, size_arg1); + r = ml_z_reduce(r, size_arg1, sign_arg2); + } + else r = Val_long(0); + } + } + Z_CHECK(r); + CAMLreturn(r); +} + +CAMLprim value ml_z_add(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat v = a1 + a2; + if (Z_FITS_INT(v)) return Val_long(v); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return ml_z_addsub(arg1, arg2, 0); +} + +CAMLprim value ml_z_sub(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat v = a1 - a2; + if (Z_FITS_INT(v)) return Val_long(v); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return ml_z_addsub(arg1, arg2, Z_SIGN_MASK); +} + +CAMLprim value ml_z_mul_overflows(value vx, value vy) +{ +#if HAS_BUILTIN(__builtin_mul_overflow) || __GNUC__ >= 5 + intnat z; + return Val_bool(__builtin_mul_overflow(vx - 1, vy >> 1, &z)); +#elif defined(__GNUC__) && defined(__x86_64__) + intnat z; + unsigned char o; + asm("imulq %1, %3; seto %0" + : "=q" (o), "=r" (z) + : "1" (vx - 1), "r" 
(vy >> 1) + : "cc"); + return Val_int(o); +#elif defined(_MSC_VER) && defined(_M_X64) + intnat hi, lo; + lo = _mul128(vx - 1, vy >> 1, &hi); + return Val_bool(hi != lo >> 63); +#else + /* Portable C code */ + intnat x = Long_val(vx); + intnat y = Long_val(vy); + /* Quick approximate check for small values of x and y. + Also catches the cases x = 0, x = 1, y = 0, y = 1. */ + if (Z_FITS_HINT(x)) { + if (Z_FITS_HINT(y)) return Val_false; + if ((uintnat) x <= 1) return Val_false; + } + if ((uintnat) y <= 1) return Val_false; +#if 1 + /* Give up at this point; we'll go through the general case in ml_z_mul */ + return Val_true; +#else + /* The product x*y is representable as an unboxed integer if + it is in [Z_MIN_INT, Z_MAX_INT]. + x >= 0 y >= 0: x*y >= 0 and x*y <= Z_MAX_INT <-> y <= Z_MAX_INT / x + x < 0 y >= 0: x*y <= 0 and x*y >= Z_MIN_INT <-> x >= Z_MIN_INT / y + x >= 0 y < 0 : x*y <= 0 and x*y >= Z_MIN_INT <-> y >= Z_MIN_INT / x + x < 0 y < 0 : x*y >= 0 and x*y <= Z_MAX_INT <-> x >= Z_MAX_INT / y */ + if (x >= 0) + if (y >= 0) + return Val_bool(y > Z_MAX_INT / x); + else + return Val_bool(y < Z_MIN_INT / x); + else + if (y >= 0) + return Val_bool(x < Z_MIN_INT / y); + else + return Val_bool(x < Z_MAX_INT / y); +#endif +#endif +} + +CAMLprim value ml_z_mul(value arg1, value arg2) +{ + Z_DECL(arg1); Z_DECL(arg2); + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2) && + ml_z_mul_overflows(arg1, arg2) == Val_false) { + return Val_long(Long_val(a1) * Long_val(a2)); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + Z_ARG(arg1); + Z_ARG(arg2); + if (!size_arg1 || !size_arg2) return Val_long(0); + { + CAMLparam2(arg1,arg2); + value r = ml_z_alloc(size_arg1 + size_arg2); + mp_limb_t c; + Z_REFRESH(arg1); + Z_REFRESH(arg2); + if (size_arg2 == 1) { + c = mpn_mul_1(Z_LIMB(r), ptr_arg1, size_arg1, *ptr_arg2); + Z_LIMB(r)[size_arg1] = c; + } + else if (size_arg1 == 1) { + c = mpn_mul_1(Z_LIMB(r), ptr_arg2, size_arg2, *ptr_arg1); + Z_LIMB(r)[size_arg2] = c; + } +#if HAVE_NATIVE_mpn_mul_2 /* untested */ + else if (size_arg2 == 2) { + c = mpn_mul_2(Z_LIMB(r), ptr_arg1, size_arg1, ptr_arg2); + Z_LIMB(r)[size_arg1 + 1] = c; + } + else if (size_arg1 == 2) { + c = mpn_mul_2(Z_LIMB(r), ptr_arg2, size_arg2, ptr_arg1); + Z_LIMB(r)[size_arg2 + 1] = c; + } +#endif + else if (size_arg1 > size_arg2) + mpn_mul(Z_LIMB(r), ptr_arg1, size_arg1, ptr_arg2, size_arg2); + else if (size_arg1 < size_arg2) + mpn_mul(Z_LIMB(r), ptr_arg2, size_arg2, ptr_arg1, size_arg1); +/* older GMP don't have mpn_sqr, so we make the optimisation optional */ +#ifdef mpn_sqr + else if (ptr_arg1 == ptr_arg2) + mpn_sqr(Z_LIMB(r), ptr_arg1, size_arg1); +#endif + else + mpn_mul_n(Z_LIMB(r), ptr_arg1, ptr_arg2, size_arg1); + r = ml_z_reduce(r, size_arg1 + size_arg2, sign_arg1^sign_arg2); + Z_CHECK(r); + CAMLreturn(r); + } +} + +/* helper function for division: returns truncated quotient and remainder */ +static value ml_z_tdiv_qr(value arg1, value arg2) +{ + CAMLparam2(arg1, arg2); + CAMLlocal3(q, r, p); + Z_DECL(arg1); Z_DECL(arg2); + Z_ARG(arg1); Z_ARG(arg2); + if (!size_arg2) ml_z_raise_divide_by_zero(); + if (size_arg1 >= size_arg2) { + q = ml_z_alloc(size_arg1 - size_arg2 + 1); + r = ml_z_alloc(size_arg2); + Z_REFRESH(arg1); Z_REFRESH(arg2); + mpn_tdiv_qr(Z_LIMB(q), Z_LIMB(r), 0, + ptr_arg1, size_arg1, ptr_arg2, size_arg2); + q = ml_z_reduce(q, size_arg1 - size_arg2 + 1, sign_arg1 ^ sign_arg2); + r = ml_z_reduce(r, size_arg2, sign_arg1); + } + else { + q = Val_long(0); + r 
= arg1; + } + Z_CHECK(q); + Z_CHECK(r); + p = caml_alloc_small(2, 0); + Field(p,0) = q; + Field(p,1) = r; + CAMLreturn(p); +} + +CAMLprim value ml_z_div_rem(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat q, r; + if (!a2) ml_z_raise_divide_by_zero(); + q = a1 / a2; + r = a1 % a2; + if (Z_FITS_INT(q) && Z_FITS_INT(r)) { + value p = caml_alloc_small(2, 0); + Field(p,0) = Val_long(q); + Field(p,1) = Val_long(r); + return p; + } + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return ml_z_tdiv_qr(arg1, arg2); +} + +CAMLprim value ml_z_div(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat q; + if (!a2) ml_z_raise_divide_by_zero(); + q = a1 / a2; + if (Z_FITS_INT(q)) return Val_long(q); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return Field(ml_z_tdiv_qr(arg1, arg2), 0); +} + +CAMLprim value ml_z_rem(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat r; + if (!a2) ml_z_raise_divide_by_zero(); + r = a1 % a2; + if (Z_FITS_INT(r)) return Val_long(r); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return Field(ml_z_tdiv_qr(arg1, arg2), 1); +} + +/* helper function for division with rounding towards +oo / -oo */ +static value ml_z_rdiv(value arg1, value arg2, intnat dir) +{ + CAMLparam2(arg1, arg2); + CAMLlocal2(q, r); + Z_DECL(arg1); Z_DECL(arg2); + Z_ARG(arg1); Z_ARG(arg2); + if (!size_arg2) ml_z_raise_divide_by_zero(); + if (size_arg1 >= size_arg2) { + mp_limb_t c = 0; + q = ml_z_alloc(size_arg1 - size_arg2 + 2); + r = ml_z_alloc(size_arg2); + Z_REFRESH(arg1); Z_REFRESH(arg2); + mpn_tdiv_qr(Z_LIMB(q), Z_LIMB(r), 0, + ptr_arg1, size_arg1, ptr_arg2, size_arg2); + if ((sign_arg1 ^ sign_arg2) == dir) { + /* outward rounding */ + mp_size_t sz; + for (sz = size_arg2; sz > 0 && !Z_LIMB(r)[sz-1]; sz--); + if (sz) { + /* r != 0: needs adjustment */ + c = mpn_add_1(Z_LIMB(q), Z_LIMB(q), size_arg1 - size_arg2 + 1, 1); + } + } + Z_LIMB(q)[size_arg1 - size_arg2 + 1] = c; + q = ml_z_reduce(q, size_arg1 - size_arg2 + 2, sign_arg1 ^ sign_arg2); + } + else { + if (size_arg1 && (sign_arg1 ^ sign_arg2) == dir) { + if (dir) q = Val_long(-1); + else q = Val_long(1); + } + else q = Val_long(0); + } + Z_CHECK(q); + CAMLreturn(q); +} + +CAMLprim value ml_z_cdiv(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat q; + if (!a2) ml_z_raise_divide_by_zero(); + /* adjust to round towards +oo */ + if (a1 > 0 && a2 > 0) a1 += a2-1; + else if (a1 < 0 && a2 < 0) a1 += a2+1; + q = a1 / a2; + if (Z_FITS_INT(q)) return Val_long(q); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return ml_z_rdiv(arg1, arg2, 0); +} + +CAMLprim value ml_z_fdiv(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat q; + if (!a2) ml_z_raise_divide_by_zero(); + /* adjust to round 
towards -oo */ + if (a1 < 0 && a2 > 0) a1 -= a2-1; + else if (a1 > 0 && a2 < 0) a1 -= a2+1; + q = a1 / a2; + if (Z_FITS_INT(q)) return Val_long(q); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return ml_z_rdiv(arg1, arg2, Z_SIGN_MASK); +} + +/* helper function for succ / pred */ +static value ml_z_succpred(value arg, intnat sign) +{ + CAMLparam1(arg); + Z_DECL(arg); + value r; + Z_ARG(arg); + r = ml_z_alloc(size_arg + 1); + Z_REFRESH(arg); + if (!size_arg) { + Z_LIMB(r)[0] = 1; + r = ml_z_reduce(r, 1, sign); + } + else if (sign_arg == sign) { + /* add 1 */ + mp_limb_t c = mpn_add_1(Z_LIMB(r), ptr_arg, size_arg, 1); + Z_LIMB(r)[size_arg] = c; + r = ml_z_reduce(r, size_arg + 1, sign_arg); + } + else { + /* subtract 1 */ + mpn_sub_1(Z_LIMB(r), ptr_arg, size_arg, 1); + r = ml_z_reduce(r, size_arg, sign_arg); + } + Z_CHECK(r); + CAMLreturn(r); +} + +CAMLprim value ml_z_succ(value arg) +{ + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg)) { + /* fast path */ + if (arg < Val_long(Z_MAX_INT)) return arg + 2; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return ml_z_succpred(arg, 0); +} + +CAMLprim value ml_z_pred(value arg) +{ + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg)) { + /* fast path */ + if (arg > Val_long(Z_MIN_INT)) return arg - 2; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + return ml_z_succpred(arg, Z_SIGN_MASK); +} + +CAMLprim value ml_z_sqrt(value arg) +{ + /* XXX TODO: fast path */ + CAMLparam1(arg); + Z_DECL(arg); + value r; + Z_MARK_OP; + Z_MARK_SLOW; + Z_CHECK(arg); + Z_ARG(arg); + if (sign_arg) + caml_invalid_argument("Z.sqrt: square root of a negative number"); + if (size_arg) { + mp_size_t sz = (size_arg + 1) / 2; + r = ml_z_alloc(sz); + Z_REFRESH(arg); + mpn_sqrtrem(Z_LIMB(r), NULL, ptr_arg, size_arg); + r = ml_z_reduce(r, sz, 0); + } + else r = Val_long(0); + Z_CHECK(r); + CAMLreturn(r); +} + +CAMLprim value ml_z_sqrt_rem(value arg) +{ + CAMLparam1(arg); + CAMLlocal3(r, s, p); + Z_DECL(arg); + /* XXX TODO: fast path */ + Z_MARK_OP; + Z_MARK_SLOW; + Z_CHECK(arg); + Z_ARG(arg); + if (sign_arg) + caml_invalid_argument("Z.sqrt_rem: square root of a negative number"); + if (size_arg) { + mp_size_t sz = (size_arg + 1) / 2, sz2; + r = ml_z_alloc(sz); + s = ml_z_alloc(size_arg); + Z_REFRESH(arg); + sz2 = mpn_sqrtrem(Z_LIMB(r), Z_LIMB(s), ptr_arg, size_arg); + r = ml_z_reduce(r, sz, 0); + s = ml_z_reduce(s, sz2, 0); + } + else r = s = Val_long(0); + Z_CHECK(r); + Z_CHECK(s); + p = caml_alloc_small(2, 0); + Field(p,0) = r; + Field(p,1) = s; + CAMLreturn(p); +} + +CAMLprim value ml_z_gcd(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + if (a1 < 0) a1 = -a1; + if (a2 < 0) a2 = -a2; + if (a1 < a2) { intnat t = a1; a1 = a2; a2 = t; } + while (a2) { + intnat r = a1 % a2; + a1 = a2; a2 = r; + } + /* If arg1 = arg2 = min_int, the result a1 is -min_int, not representable + as a tagged integer; fall through the slow case, then. */ + if (a1 <= Z_MAX_INT) return Val_long(a1); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam2(arg1, arg2); + CAMLlocal3(r, tmp1, tmp2); + mp_size_t sz, pos1, pos2, limb1, limb2, bit1, bit2, pos, limb, bit, i; + Z_DECL(arg1); Z_DECL(arg2); + Z_ARG(arg1); Z_ARG(arg2); + if (!size_arg1) r = sign_arg2 ? ml_z_neg(arg2) : arg2; + else if (!size_arg2) r = sign_arg1 ? 
ml_z_neg(arg1) : arg1; + else { + /* copy args to tmp storage & remove lower 0 bits */ + pos1 = mpn_scan1(ptr_arg1, 0); + pos2 = mpn_scan1(ptr_arg2, 0); + limb1 = pos1 / Z_LIMB_BITS; + limb2 = pos2 / Z_LIMB_BITS; + bit1 = pos1 % Z_LIMB_BITS; + bit2 = pos2 % Z_LIMB_BITS; + size_arg1 -= limb1; + size_arg2 -= limb2; + tmp1 = ml_z_alloc(size_arg1 + 1); + tmp2 = ml_z_alloc(size_arg2 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + if (bit1) { + mpn_rshift(Z_LIMB(tmp1), ptr_arg1 + limb1, size_arg1, bit1); + if (!Z_LIMB(tmp1)[size_arg1-1]) size_arg1--; + } + else ml_z_cpy_limb(Z_LIMB(tmp1), ptr_arg1 + limb1, size_arg1); + if (bit2) { + mpn_rshift(Z_LIMB(tmp2), ptr_arg2 + limb2, size_arg2, bit2); + if (!Z_LIMB(tmp2)[size_arg2-1]) size_arg2--; + } + else ml_z_cpy_limb(Z_LIMB(tmp2), ptr_arg2 + limb2, size_arg2); + /* compute gcd of 2^pos1 & 2^pos2 */ + pos = (pos1 <= pos2) ? pos1 : pos2; + limb = pos / Z_LIMB_BITS; + bit = pos % Z_LIMB_BITS; + /* compute gcd of arg1 & arg2 without lower 0 bits */ + /* second argument must have less bits than first */ + if ((size_arg1 > size_arg2) || + ((size_arg1 == size_arg2) && + (Z_LIMB(tmp1)[size_arg1 - 1] >= Z_LIMB(tmp2)[size_arg1 - 1]))) { + r = ml_z_alloc(size_arg2 + limb + 1); + sz = mpn_gcd(Z_LIMB(r) + limb, Z_LIMB(tmp1), size_arg1, Z_LIMB(tmp2), size_arg2); + } + else { + r = ml_z_alloc(size_arg1 + limb + 1); + sz = mpn_gcd(Z_LIMB(r) + limb, Z_LIMB(tmp2), size_arg2, Z_LIMB(tmp1), size_arg1); + } + /* glue the two results */ + for (i = 0; i < limb; i++) + Z_LIMB(r)[i] = 0; + Z_LIMB(r)[sz + limb] = 0; + if (bit) mpn_lshift(Z_LIMB(r) + limb, Z_LIMB(r) + limb, sz + 1, bit); + r = ml_z_reduce(r, limb + sz + 1, 0); + } + Z_CHECK(r); + CAMLreturn(r); + } +} + +/* only computes one cofactor */ +CAMLprim value ml_z_gcdext_intern(value arg1, value arg2) +{ + /* XXX TODO: fast path */ + CAMLparam2(arg1, arg2); + CAMLlocal5(r, res_arg1, res_arg2, s, p); + Z_DECL(arg1); Z_DECL(arg2); + mp_size_t sz, sn; + Z_MARK_OP; + Z_MARK_SLOW; + Z_CHECK(arg1); Z_CHECK(arg2); + Z_ARG(arg1); Z_ARG(arg2); + if (!size_arg1 || !size_arg2) ml_z_raise_divide_by_zero(); + /* copy args to tmp storage */ + res_arg1 = ml_z_alloc(size_arg1 + 1); + res_arg2 = ml_z_alloc(size_arg2 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + ml_z_cpy_limb(Z_LIMB(res_arg1), ptr_arg1, size_arg1); + ml_z_cpy_limb(Z_LIMB(res_arg2), ptr_arg2, size_arg2); + /* must have arg1 >= arg2 */ + if ((size_arg1 > size_arg2) || + ((size_arg1 == size_arg2) && + (mpn_cmp(Z_LIMB(res_arg1), Z_LIMB(res_arg2), size_arg1) >= 0))) { + r = ml_z_alloc(size_arg1 + 1); + s = ml_z_alloc(size_arg1 + 1); + sz = mpn_gcdext(Z_LIMB(r), Z_LIMB(s), &sn, + Z_LIMB(res_arg1), size_arg1, Z_LIMB(res_arg2), size_arg2); + p = caml_alloc_small(3, 0); + Field(p,2) = Val_true; + } + else { + r = ml_z_alloc(size_arg2 + 1); + s = ml_z_alloc(size_arg2 + 1); + sz = mpn_gcdext(Z_LIMB(r), Z_LIMB(s), &sn, + Z_LIMB(res_arg2), size_arg2, Z_LIMB(res_arg1), size_arg1); + p = caml_alloc_small(3, 0); + Field(p,2) = Val_false; + sign_arg1 = sign_arg2; + } + /* pack result */ + r = ml_z_reduce(r, sz, 0); + if ((int)sn >= 0) s = ml_z_reduce(s, sn, sign_arg1); + else s = ml_z_reduce(s, -sn, sign_arg1 ^ Z_SIGN_MASK); + Z_CHECK(r); + Z_CHECK(s); + Field(p,0) = r; + Field(p,1) = s; + CAMLreturn(p); +} + + +/*--------------------------------------------------- + BITWISE OPERATORS + ---------------------------------------------------*/ + +CAMLprim value ml_z_logand(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + 
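+  /* For tagged integers v = 2x + 1, (2a + 1) & (2b + 1) = 2(a & b) + 1,
+     so the bitwise AND of the two tagged words is directly the tagged
+     representation of a & b. */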
if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + return arg1 & arg2; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam2(arg1,arg2); + value r; + mp_size_t i; + mp_limb_t c; + Z_DECL(arg1); Z_DECL(arg2); + Z_ARG(arg1); Z_ARG(arg2); + /* ensure size_arg1 >= size_arg2 */ + if (size_arg1 < size_arg2) { + mp_size_t sz; + mp_limb_t *p, s; + value a; + sz = size_arg1; size_arg1 = size_arg2; size_arg2 = sz; + p = ptr_arg1; ptr_arg1 = ptr_arg2; ptr_arg2 = p; + s = sign_arg1; sign_arg1 = sign_arg2; sign_arg2 = s; + a = arg1; arg1 = arg2; arg2 = a; + } + if (!size_arg2) r = arg2; + else if (sign_arg1 && sign_arg2) { + /* arg1 < 0, arg2 < 0 => r < 0 */ + r = ml_z_alloc(size_arg1 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg1, size_arg1, 1); + c = 1; /* carry when decrementing arg2 */ + for (i = 0; i < size_arg2; i++) { + mp_limb_t v = ptr_arg2[i]; + Z_LIMB(r)[i] = Z_LIMB(r)[i] | (v - c); + c = c && !v; + } + c = mpn_add_1(Z_LIMB(r), Z_LIMB(r), size_arg1, 1); + Z_LIMB(r)[size_arg1] = c; + r = ml_z_reduce(r, size_arg1 + 1, Z_SIGN_MASK); + } + else if (sign_arg1) { + /* arg1 < 0, arg2 > 0 => r >= 0 */ + r = ml_z_alloc(size_arg2); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg1, size_arg2, 1); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = (~Z_LIMB(r)[i]) & ptr_arg2[i]; + r = ml_z_reduce(r, size_arg2, 0); + } + else if (sign_arg2) { + /* arg1 > 0, arg2 < 0 => r >= 0 */ + r = ml_z_alloc(size_arg1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg2, size_arg2, 1); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = ptr_arg1[i] & (~Z_LIMB(r)[i]); + for (; i < size_arg1; i++) + Z_LIMB(r)[i] = ptr_arg1[i]; + r = ml_z_reduce(r, size_arg1, 0); + } + else { + /* arg1, arg2 > 0 => r >= 0 */ + r = ml_z_alloc(size_arg2); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = ptr_arg1[i] & ptr_arg2[i]; + r = ml_z_reduce(r, size_arg2, 0); + } + Z_CHECK(r); + CAMLreturn(r); + } +} + +CAMLprim value ml_z_logor(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + return arg1 | arg2; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam2(arg1,arg2); + Z_DECL(arg1); Z_DECL(arg2); + mp_size_t i; + mp_limb_t c; + value r; + Z_ARG(arg1); Z_ARG(arg2); + /* ensure size_arg1 >= size_arg2 */ + if (size_arg1 < size_arg2) { + mp_size_t sz; + mp_limb_t *p, s; + value a; + sz = size_arg1; size_arg1 = size_arg2; size_arg2 = sz; + p = ptr_arg1; ptr_arg1 = ptr_arg2; ptr_arg2 = p; + s = sign_arg1; sign_arg1 = sign_arg2; sign_arg2 = s; + a = arg1; arg1 = arg2; arg2 = a; + } + if (!size_arg2) r = arg1; + else if (sign_arg1 && sign_arg2) { + /* arg1 < 0, arg2 < 0 => r < 0 */ + r = ml_z_alloc(size_arg2 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg1, size_arg2, 1); + c = 1; /* carry when decrementing arg2 */ + for (i = 0; i < size_arg2; i++) { + mp_limb_t v = ptr_arg2[i]; + Z_LIMB(r)[i] = Z_LIMB(r)[i] & (v - c); + c = c && !v; + } + c = mpn_add_1(Z_LIMB(r), Z_LIMB(r), size_arg2, 1); + Z_LIMB(r)[size_arg2] = c; + r = ml_z_reduce(r, size_arg2 + 1, Z_SIGN_MASK); + } + else if (sign_arg1) { + /* arg1 < 0, arg2 > 0 => r < 0 */ + r = ml_z_alloc(size_arg1 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg1, size_arg1, 1); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = Z_LIMB(r)[i] & (~ptr_arg2[i]); + c = mpn_add_1(Z_LIMB(r), Z_LIMB(r), 
size_arg1, 1); + Z_LIMB(r)[size_arg1] = c; + r = ml_z_reduce(r, size_arg1 + 1, Z_SIGN_MASK); + } + else if (sign_arg2) { + /* arg1 > 0, arg2 < 0 => r < 0*/ + r = ml_z_alloc(size_arg2 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg2, size_arg2, 1); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = (~ptr_arg1[i]) & Z_LIMB(r)[i]; + c = mpn_add_1(Z_LIMB(r), Z_LIMB(r), size_arg2, 1); + Z_LIMB(r)[size_arg2] = c; + r = ml_z_reduce(r, size_arg2 + 1, Z_SIGN_MASK); + } + else { + /* arg1, arg2 > 0 => r > 0 */ + r = ml_z_alloc(size_arg1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = ptr_arg1[i] | ptr_arg2[i]; + for (; i < size_arg1; i++) + Z_LIMB(r)[i] = ptr_arg1[i]; + r = ml_z_reduce(r, size_arg1, 0); + } + Z_CHECK(r); + CAMLreturn(r); + } +} + +CAMLprim value ml_z_logxor(value arg1, value arg2) +{ + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + return (arg1 ^ arg2) | 1; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam2(arg1,arg2); + Z_DECL(arg1); Z_DECL(arg2); + value r; + mp_size_t i; + mp_limb_t c; + Z_ARG(arg1); Z_ARG(arg2); + /* ensure size_arg1 >= size_arg2 */ + if (size_arg1 < size_arg2) { + mp_size_t sz; + mp_limb_t *p, s; + value a; + sz = size_arg1; size_arg1 = size_arg2; size_arg2 = sz; + p = ptr_arg1; ptr_arg1 = ptr_arg2; ptr_arg2 = p; + s = sign_arg1; sign_arg1 = sign_arg2; sign_arg2 = s; + a = arg1; arg1 = arg2; arg2 = a; + } + if (!size_arg2) r = arg1; + else if (sign_arg1 && sign_arg2) { + /* arg1 < 0, arg2 < 0 => r >=0 */ + r = ml_z_alloc(size_arg1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg1, size_arg1, 1); + c = 1; /* carry when decrementing arg2 */ + for (i = 0; i < size_arg2; i++) { + mp_limb_t v = ptr_arg2[i]; + Z_LIMB(r)[i] = Z_LIMB(r)[i] ^ (v - c); + c = c && !v; + } + r = ml_z_reduce(r, size_arg1, 0); + } + else if (sign_arg1) { + /* arg1 < 0, arg2 > 0 => r < 0 */ + r = ml_z_alloc(size_arg1 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg1, size_arg1, 1); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = Z_LIMB(r)[i] ^ ptr_arg2[i]; + c = mpn_add_1(Z_LIMB(r), Z_LIMB(r), size_arg1, 1); + Z_LIMB(r)[size_arg1] = c; + r = ml_z_reduce(r, size_arg1 + 1, Z_SIGN_MASK); + } + else if (sign_arg2) { + /* arg1 > 0, arg2 < 0 => r < 0 */ + r = ml_z_alloc(size_arg1 + 1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + mpn_sub_1(Z_LIMB(r), ptr_arg2, size_arg2, 1); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = ptr_arg1[i] ^ Z_LIMB(r)[i]; + for (; i < size_arg1; i++) + Z_LIMB(r)[i] = ptr_arg1[i]; + c = mpn_add_1(Z_LIMB(r), Z_LIMB(r), size_arg1, 1); + Z_LIMB(r)[size_arg1] = c; + r = ml_z_reduce(r, size_arg1 + 1, Z_SIGN_MASK); + } + else { + /* arg1, arg2 > 0 => r >= 0 */ + r = ml_z_alloc(size_arg1); + Z_REFRESH(arg1); + Z_REFRESH(arg2); + for (i = 0; i < size_arg2; i++) + Z_LIMB(r)[i] = ptr_arg1[i] ^ ptr_arg2[i]; + for (; i < size_arg1; i++) + Z_LIMB(r)[i] = ptr_arg1[i]; + r = ml_z_reduce(r, size_arg1, 0); + } + Z_CHECK(r); + CAMLreturn(r); + } +} + +CAMLprim value ml_z_lognot(value arg) +{ + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg)) { + /* fast path */ + return (~arg) | 1; + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam1(arg); + Z_DECL(arg); + value r; + Z_ARG(arg); + r = ml_z_alloc(size_arg + 1); + Z_REFRESH(arg); + /* compute r = -arg - 1 */ + if (!size_arg) { + /* arg = 0 => r = -1 */ + 
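+    /* Editorial worked example (added comment): lognot 0 = -1 is this
+       branch; lognot 5 = -(5+1) = -6 takes the `arg > 0` branch below;
+       lognot (-6) = 6 - 1 = 5 takes the `arg < 0` branch. */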
Z_LIMB(r)[0] = 1; + r = ml_z_reduce(r, 1, Z_SIGN_MASK); + } + else if (sign_arg) { + /* arg < 0, r > 0, |r| = |arg| - 1 */ + mpn_sub_1(Z_LIMB(r), ptr_arg, size_arg, 1); + r = ml_z_reduce(r, size_arg, 0); + } + else { + /* arg > 0, r < 0, |r| = |arg| + 1 */ + mp_limb_t c = mpn_add_1(Z_LIMB(r), ptr_arg, size_arg, 1); + Z_LIMB(r)[size_arg] = c; + r = ml_z_reduce(r, size_arg + 1, Z_SIGN_MASK); + } + Z_CHECK(r); + CAMLreturn(r); + } +} + +CAMLprim value ml_z_shift_left(value arg, value count) +{ + Z_DECL(arg); + intnat c = Long_val(count); + intnat c1, c2; + Z_MARK_OP; + Z_CHECK(arg); + if (c < 0) + caml_invalid_argument("Z.shift_left: count argument must be positive"); + if (!c) return arg; + c1 = c / Z_LIMB_BITS; + c2 = c % Z_LIMB_BITS; +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg) && !c1) { + /* fast path */ + value a = arg - 1; + value r = arg << c2; + if (a == (r >> c2)) return r | 1; + } +#endif + Z_ARG(arg); + if (!size_arg) return Val_long(0); + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam1(arg); + value r; + mp_size_t i; + r = ml_z_alloc(size_arg + c1 + 1); + Z_REFRESH(arg); + /* 0-filled limbs */ + for (i = 0; i < c1; i++) Z_LIMB(r)[i] = 0; + if (c2) { + /* shifted bits */ + mp_limb_t x = mpn_lshift(Z_LIMB(r) + c1, ptr_arg, size_arg, c2); + Z_LIMB(r)[size_arg + c1] = x; + } + else { + /* unshifted copy */ + ml_z_cpy_limb(Z_LIMB(r) + c1, ptr_arg, size_arg); + Z_LIMB(r)[size_arg + c1] = 0; + } + r = ml_z_reduce(r, size_arg + c1 + 1, sign_arg); + Z_CHECK(r); + CAMLreturn(r); + } +} + +CAMLprim value ml_z_shift_right(value arg, value count) +{ + Z_DECL(arg); + intnat c = Long_val(count); + intnat c1, c2; + value r; + Z_MARK_OP; + Z_CHECK(arg); + if (c < 0) + caml_invalid_argument("Z.shift_right: count argument must be positive"); + if (!c) return arg; + c1 = c / Z_LIMB_BITS; + c2 = c % Z_LIMB_BITS; +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg)) { + /* fast path */ + if (c1) { + if (arg < 0) return Val_long(-1); + else return Val_long(0); + } + return (arg >> c2) | 1; + } +#endif + Z_ARG(arg); + if (c1 >= size_arg) { + if (sign_arg) return Val_long(-1); + else return Val_long(0); + } + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam1(arg); + mp_limb_t cr; + r = ml_z_alloc(size_arg - c1 + 1); + Z_REFRESH(arg); + if (c2) + /* shifted bits */ + cr = mpn_rshift(Z_LIMB(r), ptr_arg + c1, size_arg - c1, c2); + else { + /* unshifted copy */ + ml_z_cpy_limb(Z_LIMB(r), ptr_arg + c1, size_arg - c1); + cr = 0; + } + if (sign_arg) { + /* round |arg| to +oo */ + mp_size_t i; + if (!cr) { + for (i = 0; i < c1; i++) + if (ptr_arg[i]) { cr = 1; break; } + } + if (cr) + cr = mpn_add_1(Z_LIMB(r), Z_LIMB(r), size_arg - c1, 1); + } + else cr = 0; + Z_LIMB(r)[size_arg - c1] = cr; + r = ml_z_reduce(r, size_arg - c1 + 1, sign_arg); + Z_CHECK(r); + CAMLreturn(r); + } +} + +CAMLprim value ml_z_shift_right_trunc(value arg, value count) +{ + Z_DECL(arg); + intnat c = Long_val(count); + intnat c1, c2; + value r; + Z_MARK_OP; + Z_CHECK(arg); + if (c < 0) + caml_invalid_argument("Z.shift_right_trunc: count argument must be positive"); + if (!c) return arg; + c1 = c / Z_LIMB_BITS; + c2 = c % Z_LIMB_BITS; +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg)) { + /* fast path */ + if (c1) return Val_long(0); + if (arg >= 1) return (arg >> c2) | 1; + else return Val_long(- ((- Long_val(arg)) >> c2)); + } +#endif + Z_ARG(arg); + if (c1 >= size_arg) return Val_long(0); + /* mpn_ version */ + Z_MARK_SLOW; + { + CAMLparam1(arg); + r = ml_z_alloc(size_arg - c1); + Z_REFRESH(arg); + if 
(c2) + /* shifted bits */ + mpn_rshift(Z_LIMB(r), ptr_arg + c1, size_arg - c1, c2); + else + /* unshifted copy */ + ml_z_cpy_limb(Z_LIMB(r), ptr_arg + c1, size_arg - c1); + r = ml_z_reduce(r, size_arg - c1, sign_arg); + Z_CHECK(r); + CAMLreturn(r); + } +} + +/* Helper function for numbits: number of leading 0 bits in x */ + +#ifdef _LONG_LONG_LIMB +#define BUILTIN_CLZ __builtin_clzll +#else +#define BUILTIN_CLZ __builtin_clzl +#endif + +/* Use GCC or Clang built-in if available. The argument must be != 0. */ +#if defined(__clang__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +#define ml_z_clz BUILTIN_CLZ +#else +/* Portable C implementation - Hacker's Delight fig 5.12 */ +int ml_z_clz(mp_limb_t x) +{ + int n; + mp_limb_t y; +#ifdef ARCH_SIXTYFOUR + n = 64; + y = x >> 32; if (y != 0) { n = n - 32; x = y; } +#else + n = 32; +#endif + y = x >> 16; if (y != 0) { n = n - 16; x = y; } + y = x >> 8; if (y != 0) { n = n - 8; x = y; } + y = x >> 4; if (y != 0) { n = n - 4; x = y; } + y = x >> 2; if (y != 0) { n = n - 2; x = y; } + y = x >> 1; if (y != 0) return n - 2; + return n - x; +} +#endif + +CAMLprim value ml_z_numbits(value arg) +{ + Z_DECL(arg); + intnat r; + int n; + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH + if (Is_long(arg)) { + /* fast path */ + r = Long_val(arg); + if (r == 0) { + return Val_int(0); + } else { + n = ml_z_clz(r > 0 ? r : -r); + return Val_long(sizeof(intnat) * 8 - n); + } + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + Z_ARG(arg); + if (size_arg == 0) return Val_int(0); + n = ml_z_clz(ptr_arg[size_arg - 1]); + return Val_long(size_arg * Z_LIMB_BITS - n); +} + +/* Helper function for trailing_zeros: number of trailing 0 bits in x */ + +#ifdef _LONG_LONG_LIMB +#define BUILTIN_CTZ __builtin_ctzll +#else +#define BUILTIN_CTZ __builtin_ctzl +#endif + +/* Use GCC or Clang built-in if available. The argument must be != 0. */ +#if defined(__clang__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +#define ml_z_ctz BUILTIN_CTZ +#else +/* Portable C implementation - Hacker's Delight fig 5.21 */ +int ml_z_ctz(mp_limb_t x) +{ + int n; + mp_limb_t y; + CAMLassert (x != 0); +#ifdef ARCH_SIXTYFOUR + n = 63; + y = x << 32; if (y != 0) { n = n - 32; x = y; } +#else + n = 31; +#endif + y = x << 16; if (y != 0) { n = n - 16; x = y; } + y = x << 8; if (y != 0) { n = n - 8; x = y; } + y = x << 4; if (y != 0) { n = n - 4; x = y; } + y = x << 2; if (y != 0) { n = n - 2; x = y; } + y = x << 1; if (y != 0) { n = n - 1; } + return n; +} +#endif + +CAMLprim value ml_z_trailing_zeros(value arg) +{ + Z_DECL(arg); + intnat r; + mp_size_t i; + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH + if (Is_long(arg)) { + /* fast path */ + r = Long_val(arg); + if (r == 0) { + return Val_long (Max_long); + } else { + /* No need to take absolute value of r, as ctz(-x) = ctz(x) */ + return Val_long (ml_z_ctz(r)); + } + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + Z_ARG(arg); + if (size_arg == 0) return Val_long (Max_long); + for (i = 0; ptr_arg[i] == 0; i++) /* skip */; + return Val_long(i * Z_LIMB_BITS + ml_z_ctz(ptr_arg[i])); +} + +/* helper function for popcount & hamdist: number of bits at 1 in x */ +/* maybe we should use the mpn_ function even for small arguments, in case + the CPU has a fast popcount opcode? 
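+   One possible answer (editorial sketch, mirroring the BUILTIN_CLZ /
+   BUILTIN_CTZ pattern above; not part of the original source):
+
+     #ifdef _LONG_LONG_LIMB
+     #define BUILTIN_POPCOUNT __builtin_popcountll
+     #else
+     #define BUILTIN_POPCOUNT __builtin_popcountl
+     #endif
+
+   guarded by the same GCC/Clang version check, with ml_z_count kept as the
+   portable fallback.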
+ */ +uintnat ml_z_count(uintnat x) +{ +#ifdef ARCH_SIXTYFOUR + x = (x & 0x5555555555555555UL) + ((x >> 1) & 0x5555555555555555UL); + x = (x & 0x3333333333333333UL) + ((x >> 2) & 0x3333333333333333UL); + x = (x & 0x0f0f0f0f0f0f0f0fUL) + ((x >> 4) & 0x0f0f0f0f0f0f0f0fUL); + x = (x & 0x00ff00ff00ff00ffUL) + ((x >> 8) & 0x00ff00ff00ff00ffUL); + x = (x & 0x0000ffff0000ffffUL) + ((x >> 16) & 0x0000ffff0000ffffUL); + x = (x & 0x00000000ffffffffUL) + ((x >> 32) & 0x00000000ffffffffUL); +#else + x = (x & 0x55555555UL) + ((x >> 1) & 0x55555555UL); + x = (x & 0x33333333UL) + ((x >> 2) & 0x33333333UL); + x = (x & 0x0f0f0f0fUL) + ((x >> 4) & 0x0f0f0f0fUL); + x = (x & 0x00ff00ffUL) + ((x >> 8) & 0x00ff00ffUL); + x = (x & 0x0000ffffUL) + ((x >> 16) & 0x0000ffffUL); +#endif + return x; +} + +CAMLprim value ml_z_popcount(value arg) +{ + Z_DECL(arg); + intnat r; + Z_MARK_OP; + Z_CHECK(arg); +#if Z_FAST_PATH + if (Is_long(arg)) { + /* fast path */ + r = Long_val(arg); + if (r < 0) ml_z_raise_overflow(); + return Val_long(ml_z_count(r)); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + Z_ARG(arg); + if (sign_arg) ml_z_raise_overflow(); + if (!size_arg) return Val_long(0); + r = mpn_popcount(ptr_arg, size_arg); + if (r < 0 || !Z_FITS_INT(r)) ml_z_raise_overflow(); + return Val_long(r); +} + +CAMLprim value ml_z_hamdist(value arg1, value arg2) +{ + Z_DECL(arg1); Z_DECL(arg2); + intnat r; + mp_size_t sz; + Z_MARK_OP; + Z_CHECK(arg1); + Z_CHECK(arg2); +#if Z_FAST_PATH + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + r = Long_val(arg1) ^ Long_val(arg2); + if (r < 0) ml_z_raise_overflow(); + return Val_long(ml_z_count(r)); + } +#endif + /* mpn_ version */ + Z_MARK_SLOW; + Z_ARG(arg1); + Z_ARG(arg2); + if (sign_arg1 != sign_arg2) ml_z_raise_overflow(); + /* XXX TODO: case where arg1 & arg2 are both negative */ + if (sign_arg1 || sign_arg2) + caml_invalid_argument("Z.hamdist: negative arguments"); + /* distance on common size */ + sz = (size_arg1 <= size_arg2) ? size_arg1 : size_arg2; + if (sz) { + r = mpn_hamdist(ptr_arg1, ptr_arg2, sz); + if (r < 0 || !Z_FITS_INT(r)) ml_z_raise_overflow(); + } + else r = 0; + /* add stray bits */ + if (size_arg1 > size_arg2) { + r += mpn_popcount(ptr_arg1 + size_arg2, size_arg1 - size_arg2); + if (r < 0 || !Z_FITS_INT(r)) ml_z_raise_overflow(); + } + else if (size_arg2 > size_arg1) { + r += mpn_popcount(ptr_arg2 + size_arg1, size_arg2 - size_arg1); + if (r < 0 || !Z_FITS_INT(r)) ml_z_raise_overflow(); + } + return Val_long(r); +} + +CAMLprim value ml_z_testbit(value arg, value index) +{ + Z_DECL(arg); + uintnat b_idx; + mp_size_t l_idx, i; + mp_limb_t limb; + Z_MARK_OP; + Z_CHECK(arg); + b_idx = Long_val(index); /* Caml code checked index >= 0 */ +#if Z_FAST_PATH + if (Is_long(arg)) { + if (b_idx >= Z_LIMB_BITS) b_idx = Z_LIMB_BITS - 1; + return Val_int((Long_val(arg) >> b_idx) & 1); + } +#endif + Z_MARK_SLOW; + Z_ARG(arg); + l_idx = b_idx / Z_LIMB_BITS; + if (l_idx >= size_arg) return Val_bool(sign_arg); + limb = ptr_arg[l_idx]; + if (sign_arg != 0) { + /* If arg is negative, its 2-complement representation is + bitnot(abs(arg) - 1). + If any of the limbs of abs(arg) below l_idx is nonzero, + the carry from the decrement dies before reaching l_idx, + and we just test bitnot(limb). + If all the limbs below l_idx are zero, the carry from the + decrement propagates to l_idx, + and we test bitnot(limb - 1) = - limb. 
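+     Editorial worked example: for arg = -6 (a single limb holding 6) there
+     are no lower non-zero limbs, so we test -limb = ...11111010, the two's
+     complement of 6; bit 0 is 0 and bit 1 is 1, as expected.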
*/ + for (i = 0; i < l_idx; i++) { + if (ptr_arg[i] != 0) { limb = ~limb; goto extract; } + } + limb = -limb; + } + extract: + return Val_int((limb >> (b_idx % Z_LIMB_BITS)) & 1); +} + +/*--------------------------------------------------- + FUNCTIONS BASED ON mpz_t + ---------------------------------------------------*/ + +/* sets rop to the value in op (limbs are copied) */ +void ml_z_mpz_set_z(mpz_t rop, value op) +{ + Z_DECL(op); + Z_CHECK(op); + Z_ARG(op); + if (size_op * Z_LIMB_BITS > INT_MAX) + caml_invalid_argument("Z: risk of overflow in mpz type"); + mpz_realloc2(rop, size_op * Z_LIMB_BITS); + rop->_mp_size = (sign_op >= 0) ? size_op : -size_op; + ml_z_cpy_limb(rop->_mp_d, ptr_op, size_op); +} + +/* inits and sets rop to the value in op (limbs are copied) */ +void ml_z_mpz_init_set_z(mpz_t rop, value op) +{ + mpz_init(rop); + ml_z_mpz_set_z(rop,op); +} + +/* returns a new z objects equal to op (limbs are copied) */ +value ml_z_from_mpz(mpz_t op) +{ + value r; + size_t sz = mpz_size(op); + r = ml_z_alloc(sz); + ml_z_cpy_limb(Z_LIMB(r), op->_mp_d, sz); + return ml_z_reduce(r, sz, (mpz_sgn(op) >= 0) ? 0 : Z_SIGN_MASK); +} + +#if __GNU_MP_VERSION >= 5 +/* not exported by gmp.h */ +extern void __gmpn_divexact (mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t); +#endif + +CAMLprim value ml_z_divexact(value arg1, value arg2) +{ + Z_DECL(arg1); Z_DECL(arg2); + Z_MARK_OP; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH && !Z_FAST_PATH_IN_OCAML + if (Is_long(arg1) && Is_long(arg2)) { + /* fast path */ + intnat a1 = Long_val(arg1); + intnat a2 = Long_val(arg2); + intnat q; + if (!a2) ml_z_raise_divide_by_zero(); + q = a1 / a2; + if (Z_FITS_INT(q)) return Val_long(q); + } +#endif + Z_MARK_SLOW; +#if __GNU_MP_VERSION >= 5 + { + /* mpn_ version */ + Z_ARG(arg1); + Z_ARG(arg2); + if (!size_arg2) + ml_z_raise_divide_by_zero(); + if (size_arg1 < size_arg2) + return Val_long(0); + { + CAMLparam2(arg1,arg2); + CAMLlocal1(q); + q = ml_z_alloc(size_arg1 - size_arg2 + 1); + Z_REFRESH(arg1); Z_REFRESH(arg2); + __gmpn_divexact(Z_LIMB(q), + ptr_arg1, size_arg1, ptr_arg2, size_arg2); + q = ml_z_reduce(q, size_arg1 - size_arg2 + 1, sign_arg1 ^ sign_arg2); + Z_CHECK(q); + CAMLreturn(q); + } + } +#else + { + /* mpz_ version */ + CAMLparam2(arg1,arg2); + CAMLlocal1(r); + mpz_t a,b; + if (!ml_z_sgn(arg2)) + ml_z_raise_divide_by_zero(); + ml_z_mpz_init_set_z(a, arg1); + ml_z_mpz_init_set_z(b, arg2); + mpz_divexact(a, a, b); + r = ml_z_from_mpz(a); + mpz_clear(a); + mpz_clear(b); + CAMLreturn(r); + } +#endif +} + +CAMLprim value ml_z_powm(value base, value exp, value mod) +{ + CAMLparam3(base,exp,mod); + CAMLlocal1(r); + Z_DECL(mod); + mpz_t mbase, mexp, mmod; + Z_ARG(mod); + if (!size_mod) + ml_z_raise_divide_by_zero(); + ml_z_mpz_init_set_z(mbase, base); + ml_z_mpz_init_set_z(mexp, exp); + ml_z_mpz_init_set_z(mmod, mod); + if (mpz_sgn(mexp) < 0) { + /* we need to check whether base is invertible to avoid a division by zero + in mpz_powm, so we can as well use the computed inverse + */ + if (!mpz_invert(mbase, mbase, mmod)) { + mpz_clear(mbase); + mpz_clear(mexp); + mpz_clear(mmod); + ml_z_raise_divide_by_zero(); + } + mpz_neg(mexp, mexp); + } + mpz_powm(mbase, mbase, mexp, mmod); + r = ml_z_from_mpz(mbase); + mpz_clear(mbase); + mpz_clear(mexp); + mpz_clear(mmod); + CAMLreturn(r); +} + +CAMLprim value ml_z_powm_sec(value base, value exp, value mod) +{ +#ifndef HAS_MPIR +#if __GNU_MP_VERSION >= 5 + CAMLparam3(base,exp,mod); + CAMLlocal1(r); + mpz_t mbase, mexp, mmod; + ml_z_mpz_init_set_z(mbase, base); + 
ml_z_mpz_init_set_z(mexp, exp); + ml_z_mpz_init_set_z(mmod, mod); + if (mpz_sgn(mexp) <= 0) { + mpz_clear(mbase); + mpz_clear(mexp); + mpz_clear(mmod); + caml_invalid_argument("Z.powm_sec: exponent must be positive"); + } + if (! mpz_odd_p(mmod)) { + mpz_clear(mbase); + mpz_clear(mexp); + mpz_clear(mmod); + caml_invalid_argument("Z.powm_sec: modulus must be odd"); + } + mpz_powm_sec(mbase, mbase, mexp, mmod); + r = ml_z_from_mpz(mbase); + mpz_clear(mbase); + mpz_clear(mexp); + mpz_clear(mmod); + CAMLreturn(r); +#else + MAYBE_UNUSED(base); + MAYBE_UNUSED(exp); + MAYBE_UNUSED(mod); + caml_invalid_argument("Z.powm_sec: not available, needs GMP version >= 5"); +#endif +#else + MAYBE_UNUSED(base); + MAYBE_UNUSED(exp); + MAYBE_UNUSED(mod); + caml_invalid_argument("Z.powm_sec: not available in MPIR, needs GMP version >= 5"); +#endif +} + +CAMLprim value ml_z_pow(value base, value exp) +{ + CAMLparam2(base,exp); + CAMLlocal1(r); + mpz_t mbase; + intnat e = Long_val(exp); + mp_size_t sz, ralloc; + int cnt; + if (e < 0) + caml_invalid_argument("Z.pow: exponent must be nonnegative"); + ml_z_mpz_init_set_z(mbase, base); + + /* Safe overapproximation of the size of the result. + In case this overflows an int, GMP may abort with a message + "gmp: overflow in mpz type". To avoid this, we test the size before + calling mpz_pow_ui and raise an OCaml exception. + Note: we lifted the computation from mpz_n_pow_ui. + */ + sz = mbase->_mp_size; + if (sz < 0) sz = -sz; + cnt = sz > 0 ? ml_z_clz(mbase->_mp_d[sz - 1]) : 0; + ralloc = (sz * GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5; + if (ralloc > INT_MAX) { + mpz_clear(mbase); + caml_invalid_argument("Z.pow: risk of overflow in mpz type"); + } + mpz_pow_ui(mbase, mbase, e); + r = ml_z_from_mpz(mbase); + mpz_clear(mbase); + CAMLreturn(r); +} + +CAMLprim value ml_z_root(value a, value b) +{ + CAMLparam2(a,b); + CAMLlocal1(r); + Z_DECL(a); + mpz_t ma; + intnat mb = Long_val(b); + if (mb <= 0) + caml_invalid_argument("Z.root: exponent must be positive"); + Z_ARG(a); + if (!(mb & 1) && sign_a) + caml_invalid_argument("Z.root: even root of a negative number"); + ml_z_mpz_init_set_z(ma, a); + mpz_root(ma, ma, mb); + r = ml_z_from_mpz(ma); + mpz_clear(ma); + CAMLreturn(r); +} + +CAMLprim value ml_z_rootrem(value a, value b) +{ + CAMLparam2(a,b); + CAMLlocal3(r1,r2,r3); + Z_DECL(a); + mpz_t ma, mr1, mr2; + intnat mb = Long_val(b); + if (mb <= 0) + caml_invalid_argument("Z.rootrem: exponent must be positive"); + Z_ARG(a); + if (!(mb & 1) && sign_a) + caml_invalid_argument("Z.rootrem: even root of a negative number"); + ml_z_mpz_init_set_z(ma, a); + mpz_init(mr1); + mpz_init(mr2); + mpz_rootrem(mr1, mr2, ma, mb); + r1 = ml_z_from_mpz(mr1); + r2 = ml_z_from_mpz(mr2); + r3 = caml_alloc_small(2, 0); + Field(r3,0) = r1; + Field(r3,1) = r2; + mpz_clear(ma); + mpz_clear(mr1); + mpz_clear(mr2); + CAMLreturn(r3); +} + +CAMLprim value ml_z_perfect_power(value a) +{ + CAMLparam1(a); + int r; + mpz_t ma; + ml_z_mpz_init_set_z(ma, a); + r = mpz_perfect_power_p(ma); + mpz_clear(ma); + CAMLreturn(r ? Val_true : Val_false); +} + +CAMLprim value ml_z_perfect_square(value a) +{ + CAMLparam1(a); + int r; + mpz_t ma; + ml_z_mpz_init_set_z(ma, a); + r = mpz_perfect_square_p(ma); + mpz_clear(ma); + CAMLreturn(r ? 
Val_true : Val_false); +} + +CAMLprim value ml_z_probab_prime(value a, int b) +{ + CAMLparam1(a); + int r; + mpz_t ma; + ml_z_mpz_init_set_z(ma, a); + r = mpz_probab_prime_p(ma, Int_val(b)); + mpz_clear(ma); + CAMLreturn(Val_int(r)); +} + +CAMLprim value ml_z_nextprime(value a) +{ + CAMLparam1(a); + CAMLlocal1(r); + mpz_t ma; + ml_z_mpz_init_set_z(ma, a); + mpz_nextprime(ma, ma); + r = ml_z_from_mpz(ma); + mpz_clear(ma); + CAMLreturn(r); +} + +CAMLprim value ml_z_invert(value base, value mod) +{ + CAMLparam2(base,mod); + CAMLlocal1(r); + mpz_t mbase, mmod; + ml_z_mpz_init_set_z(mbase, base); + ml_z_mpz_init_set_z(mmod, mod); + if (!mpz_invert(mbase, mbase, mmod)) { + mpz_clear(mbase); + mpz_clear(mmod); + ml_z_raise_divide_by_zero(); + } + r = ml_z_from_mpz(mbase); + mpz_clear(mbase); + mpz_clear(mmod); + CAMLreturn(r); +} + +CAMLprim value ml_z_divisible(value a, value b) +{ + CAMLparam2(a,b); + mpz_t ma, mb; + int r; + ml_z_mpz_init_set_z(ma, a); + ml_z_mpz_init_set_z(mb, b); + r = mpz_divisible_p(ma, mb); + mpz_clear(ma); + mpz_clear(mb); + CAMLreturn(Val_bool(r)); +} + +CAMLprim value ml_z_congruent(value a, value b, value c) +{ + CAMLparam3(a,b,c); + mpz_t ma, mb, mc; + int r; + ml_z_mpz_init_set_z(ma, a); + ml_z_mpz_init_set_z(mb, b); + ml_z_mpz_init_set_z(mc, c); + r = mpz_congruent_p(ma, mb, mc); + mpz_clear(ma); + mpz_clear(mb); + mpz_clear(mc); + CAMLreturn(Val_bool(r)); +} + +CAMLprim value ml_z_jacobi(value a, value b) +{ + CAMLparam2(a,b); + mpz_t ma, mb; + int r; + ml_z_mpz_init_set_z(ma, a); + ml_z_mpz_init_set_z(mb, b); + r = mpz_jacobi(ma, mb); + mpz_clear(ma); + mpz_clear(mb); + CAMLreturn(Val_int(r)); +} + +CAMLprim value ml_z_legendre(value a, value b) +{ + CAMLparam2(a,b); + mpz_t ma, mb; + int r; + ml_z_mpz_init_set_z(ma, a); + ml_z_mpz_init_set_z(mb, b); + r = mpz_legendre(ma, mb); + mpz_clear(ma); + mpz_clear(mb); + CAMLreturn(Val_int(r)); +} + +CAMLprim value ml_z_kronecker(value a, value b) +{ + CAMLparam2(a,b); + mpz_t ma, mb; + int r; + ml_z_mpz_init_set_z(ma, a); + ml_z_mpz_init_set_z(mb, b); + r = mpz_kronecker(ma, mb); + mpz_clear(ma); + mpz_clear(mb); + CAMLreturn(Val_int(r)); +} + +CAMLprim value ml_z_remove(value a, value b) +{ + CAMLparam2(a,b); + CAMLlocal1(r); + mpz_t ma, mb, mr; + int i; + ml_z_mpz_init_set_z(ma, a); + ml_z_mpz_init_set_z(mb, b); + mpz_init(mr); + i = mpz_remove(mr, ma, mb); + r = caml_alloc_small(2, 0); + Field(r,0) = ml_z_from_mpz(mr); + Field(r,1) = Val_int(i); + mpz_clear(ma); + mpz_clear(mb); + mpz_clear(mr); + CAMLreturn(r); +} + +CAMLprim value ml_z_fac(value a) +{ + CAMLparam1(a); + CAMLlocal1(r); + mpz_t mr; + intnat ma = Long_val(a); + if (ma < 0) + caml_invalid_argument("Z.fac: non-positive argument"); + mpz_init(mr); + mpz_fac_ui(mr, ma); + r = ml_z_from_mpz(mr); + mpz_clear(mr); + CAMLreturn(r); +} + +CAMLprim value ml_z_fac2(value a) +{ + CAMLparam1(a); + CAMLlocal1(r); + mpz_t mr; + intnat ma = Long_val(a); + if (ma < 0) + caml_invalid_argument("Z.fac2: non-positive argument"); + mpz_init(mr); + mpz_2fac_ui(mr, ma); + r = ml_z_from_mpz(mr); + mpz_clear(mr); + CAMLreturn(r); +} + +CAMLprim value ml_z_facM(value a, value b) +{ + CAMLparam2(a,b); + CAMLlocal1(r); + mpz_t mr; + intnat ma = Long_val(a), mb = Long_val(b); + if (ma < 0 || mb < 0) + caml_invalid_argument("Z.facM: non-positive argument"); + mpz_init(mr); + mpz_mfac_uiui(mr, ma, mb); + r = ml_z_from_mpz(mr); + mpz_clear(mr); + CAMLreturn(r); +} + +CAMLprim value ml_z_primorial(value a) +{ + CAMLparam1(a); + CAMLlocal1(r); + mpz_t mr; + intnat ma = Long_val(a); + 
if (ma < 0) + caml_invalid_argument("Z.primorial: non-positive argument"); + mpz_init(mr); + mpz_primorial_ui(mr, ma); + r = ml_z_from_mpz(mr); + mpz_clear(mr); + CAMLreturn(r); +} + +CAMLprim value ml_z_bin(value a, value b) +{ + CAMLparam2(a,b); + CAMLlocal1(r); + mpz_t ma; + intnat mb = Long_val(b); + if (mb < 0) + caml_invalid_argument("Z.bin: non-positive argument"); + ml_z_mpz_init_set_z(ma, a); + mpz_bin_ui(ma, ma, mb); + r = ml_z_from_mpz(ma); + mpz_clear(ma); + CAMLreturn(r); +} + +CAMLprim value ml_z_fib(value a) +{ + CAMLparam1(a); + CAMLlocal1(r); + mpz_t mr; + intnat ma = Long_val(a); + if (ma < 0) + caml_invalid_argument("Z.fib: non-positive argument"); + mpz_init(mr); + mpz_fib_ui(mr, ma); + r = ml_z_from_mpz(mr); + mpz_clear(mr); + CAMLreturn(r); +} + +CAMLprim value ml_z_lucnum(value a) +{ + CAMLparam1(a); + CAMLlocal1(r); + mpz_t mr; + intnat ma = Long_val(a); + if (ma < 0) + caml_invalid_argument("Z.lucnum: non-positive argument"); + mpz_init(mr); + mpz_lucnum_ui(mr, ma); + r = ml_z_from_mpz(mr); + mpz_clear(mr); + CAMLreturn(r); +} + + + +/* XXX should we support the following? + mpz_scan0, mpz_scan1 + mpz_setbit, mpz_clrbit, mpz_combit, mpz_tstbit + mpz_odd_p, mpz_even_p + random numbers +*/ + + + +/*--------------------------------------------------- + CUSTOMS BLOCKS + ---------------------------------------------------*/ + +/* With OCaml < 3.12.1, comparing a block an int with OCaml's + polymorphic compare will give erroneous results (int always + strictly smaller than block). OCaml 3.12.1 and above + give the correct result. +*/ +int ml_z_custom_compare(value arg1, value arg2) +{ + Z_DECL(arg1); Z_DECL(arg2); + int r; + Z_CHECK(arg1); Z_CHECK(arg2); +#if Z_FAST_PATH + /* Value-equal small integers are equal. + Pointer-equal big integers are equal as well. */ + if (arg1 == arg2) return 0; + if (Is_long(arg2)) { + if (Is_long(arg1)) { + return arg1 > arg2 ? 1 : -1; + } else { + /* Either arg1 is positive and arg1 > Z_MAX_INT >= arg2 -> result +1 + or arg1 is negative and arg1 < Z_MIN_INT <= arg2 -> result -1 */ + return Z_SIGN(arg1) ? -1 : 1; + } + } + else if (Is_long(arg1)) { + /* Either arg2 is positive and arg2 > Z_MAX_INT >= arg1 -> result -1 + or arg2 is negative and arg2 < Z_MIN_INT <= arg1 -> result +1 */ + return Z_SIGN(arg2) ? 
1 : -1; + } +#endif + r = 0; + Z_ARG(arg1); + Z_ARG(arg2); + if (sign_arg1 != sign_arg2) r = 1; + else if (size_arg1 > size_arg2) r = 1; + else if (size_arg1 < size_arg2) r = -1; + else { + mp_size_t i; + for (i = size_arg1 - 1; i >= 0; i--) { + if (ptr_arg1[i] > ptr_arg2[i]) { r = 1; break; } + if (ptr_arg1[i] < ptr_arg2[i]) { r = -1; break; } + } + } + if (sign_arg1) r = -r; + return r; +} + +static intnat ml_z_custom_hash(value v) +{ + Z_DECL(v); + mp_size_t i; + uint32_t acc = 0; + Z_CHECK(v); + Z_ARG(v); + for (i = 0; i < size_v; i++) { + acc = caml_hash_mix_uint32(acc, (uint32_t)(ptr_v[i])); +#ifdef ARCH_SIXTYFOUR + acc = caml_hash_mix_uint32(acc, ptr_v[i] >> 32); +#endif + } +#ifndef ARCH_SIXTYFOUR + /* To obtain the same hash value on 32- and 64-bit platforms */ + if (size_v % 2 != 0) + acc = caml_hash_mix_uint32(acc, 0); +#endif + if (sign_v) acc++; + return acc; +} + +CAMLprim value ml_z_hash(value v) +{ + return Val_long(ml_z_custom_hash(v)); +} + +/* serialized format: + - 1-byte sign (1 for negative, 0 for positive) + - 4-byte size in bytes + - size-byte unsigned integer, in little endian order + */ +static void ml_z_custom_serialize(value v, + uintnat * wsize_32, + uintnat * wsize_64) +{ + mp_size_t i,nb; + Z_DECL(v); + Z_CHECK(v); + Z_ARG(v); + if ((mp_size_t)(uint32_t) size_v != size_v) + caml_failwith("Z.serialize: number is too large"); + nb = size_v * sizeof(mp_limb_t); + caml_serialize_int_1(sign_v ? 1 : 0); + caml_serialize_int_4(nb); + for (i = 0; i < size_v; i++) { + mp_limb_t x = ptr_v[i]; + caml_serialize_int_1(x); + caml_serialize_int_1(x >> 8); + caml_serialize_int_1(x >> 16); + caml_serialize_int_1(x >> 24); +#ifdef ARCH_SIXTYFOUR + caml_serialize_int_1(x >> 32); + caml_serialize_int_1(x >> 40); + caml_serialize_int_1(x >> 48); + caml_serialize_int_1(x >> 56); +#endif + } + *wsize_32 = 4 * (1 + (nb + 3) / 4); + *wsize_64 = 8 * (1 + (nb + 7) / 8); +#if Z_PERFORM_CHECK + /* Add space for canary */ + *wsize_32 += 4; + *wsize_64 += 8; +#endif +} + +/* XXX: serializing a large (i.e., > 2^31) int on a 64-bit machine and + deserializing on a 32-bit machine will fail (instead of returning a + block). 
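+     For reference (editorial example of the format): on a 64-bit machine
+     the value 2^64 has limbs [0; 1], so it serializes as the sign byte 00,
+     the 4-byte length 16, then sixteen little-endian data bytes: eight 00s
+     for the low limb, then 01 followed by seven 00s for the high limb.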
+ */ +static uintnat ml_z_custom_deserialize(void * dst) +{ + mp_limb_t* d = ((mp_limb_t*)dst) + 1; + int sign = caml_deserialize_uint_1(); + uint32_t sz = caml_deserialize_uint_4(); + uint32_t szw = (sz + sizeof(mp_limb_t) - 1) / sizeof(mp_limb_t); + uint32_t i = 0; + mp_limb_t x; + /* all limbs but last */ + if (szw > 1) { + for (; i < szw - 1; i++) { + x = caml_deserialize_uint_1(); + x |= ((mp_limb_t) caml_deserialize_uint_1()) << 8; + x |= ((mp_limb_t) caml_deserialize_uint_1()) << 16; + x |= ((mp_limb_t) caml_deserialize_uint_1()) << 24; +#ifdef ARCH_SIXTYFOUR + x |= ((mp_limb_t) caml_deserialize_uint_1()) << 32; + x |= ((mp_limb_t) caml_deserialize_uint_1()) << 40; + x |= ((mp_limb_t) caml_deserialize_uint_1()) << 48; + x |= ((mp_limb_t) caml_deserialize_uint_1()) << 56; +#endif + d[i] = x; + } + sz -= i * sizeof(mp_limb_t); + } + /* last limb */ + if (sz > 0) { + x = caml_deserialize_uint_1(); + if (sz > 1) x |= ((mp_limb_t) caml_deserialize_uint_1()) << 8; + if (sz > 2) x |= ((mp_limb_t) caml_deserialize_uint_1()) << 16; + if (sz > 3) x |= ((mp_limb_t) caml_deserialize_uint_1()) << 24; +#ifdef ARCH_SIXTYFOUR + if (sz > 4) x |= ((mp_limb_t) caml_deserialize_uint_1()) << 32; + if (sz > 5) x |= ((mp_limb_t) caml_deserialize_uint_1()) << 40; + if (sz > 6) x |= ((mp_limb_t) caml_deserialize_uint_1()) << 48; + if (sz > 7) x |= ((mp_limb_t) caml_deserialize_uint_1()) << 56; +#endif + d[i] = x; + i++; + } + while (i > 0 && !d[i-1]) i--; + d[-1] = i | (sign ? Z_SIGN_MASK : 0); +#if Z_PERFORM_CHECK + d[szw] = 0xDEADBEEF ^ szw; + szw++; +#endif + return (szw+1) * sizeof(mp_limb_t); +} + +struct custom_operations ml_z_custom_ops = { + /* Identifiers starting with _ are normally reserved for the OCaml runtime + system, but we got authorization form Gallium to use "_z". + It is very compact and stays in the spirit of identifiers used for + int32 & co ("_i" & co.). + */ + "_z", + custom_finalize_default, + ml_z_custom_compare, + ml_z_custom_hash, + ml_z_custom_serialize, + ml_z_custom_deserialize, + ml_z_custom_compare, +#ifndef Z_OCAML_LEGACY_CUSTOM_OPERATIONS + custom_fixed_length_default +#endif +}; + + +/*--------------------------------------------------- + CONVERSION WITH MLGMPIDL + ---------------------------------------------------*/ + +CAMLprim value ml_z_mlgmpidl_of_mpz(value a) +{ + CAMLparam1(a); + mpz_ptr mpz = (mpz_ptr)(Data_custom_val(a)); + CAMLreturn(ml_z_from_mpz(mpz)); +} + +/* stores the Z.t object into an existing Mpz.t one; + as we never allocate Mpz.t objects, we don't need any pointer to + mlgmpidl's custom block ops, and so, can link the function even if + mlgmpidl is not installed + */ +CAMLprim value ml_z_mlgmpidl_set_mpz(value r, value a) +{ + CAMLparam2(r,a); + mpz_ptr mpz = (mpz_ptr)(Data_custom_val(r)); + ml_z_mpz_set_z(mpz,a); + CAMLreturn(Val_unit); +} + + + +/*--------------------------------------------------- + INIT / EXIT + ---------------------------------------------------*/ + +/* called at program exit to display performance information */ +#if Z_PERF_COUNTER +static void ml_z_dump_count() +{ + printf("Z: %lu asm operations, %lu C operations, %lu slow (%lu%%)\n", + ml_z_ops_as, ml_z_ops, ml_z_slow, + ml_z_ops ? 
(ml_z_slow*100/(ml_z_ops+ml_z_ops_as)) : 0);
+}
+#endif
+
+CAMLprim value ml_z_init()
+{
+  ml_z_2p32 = ldexp(1., 32);
+  /* run-time checks */
+#ifdef ARCH_SIXTYFOUR
+  if (sizeof(intnat) != 8 || sizeof(mp_limb_t) != 8)
+    caml_failwith("Z.init: invalid size of types, 8 expected");
+#else
+  if (sizeof(intnat) != 4 || sizeof(mp_limb_t) != 4)
+    caml_failwith("Z.init: invalid size of types, 4 expected");
+#endif
+  /* install functions */
+#if Z_PERF_COUNTER
+  atexit(ml_z_dump_count);
+#endif
+#if Z_CUSTOM_BLOCK
+  caml_register_custom_operations(&ml_z_custom_ops);
+#endif
+  return Val_unit;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/Zarith/zarith.h b/Zarith/zarith.h
new file mode 100644
index 000000000000..b6ce28798882
--- /dev/null
+++ b/Zarith/zarith.h
@@ -0,0 +1,46 @@
+/* This file is cloned from
+   https://github.com/ocaml/Zarith/blob/39df015/zarith.h
+*/
+
+/**
+   Public C interface for Zarith.
+
+   This is intended for C libraries that wish to convert between mpz_t and
+   Z.t objects.
+
+
+   This file is part of the Zarith library
+   http://forge.ocamlcore.org/projects/zarith .
+   It is distributed under LGPL 2 licensing, with static linking exception.
+   See the LICENSE file included in the distribution.
+
+   Copyright (c) 2010-2011 Antoine Miné, Abstraction project.
+   Abstraction is part of the LIENS (Laboratoire d'Informatique de l'ENS),
+   a joint laboratory by:
+   CNRS (Centre national de la recherche scientifique, France),
+   ENS (École normale supérieure, Paris, France),
+   INRIA Rocquencourt (Institut national de recherche en informatique, France).
+
+*/
+
+
+/* gmp.h or mpir.h must be included manually before zarith.h */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <caml/mlvalues.h>
+
+/* sets rop to the value in op (limbs are copied) */
+void ml_z_mpz_set_z(mpz_t rop, value op);
+
+/* inits and sets rop to the value in op (limbs are copied) */
+void ml_z_mpz_init_set_z(mpz_t rop, value op);
+
+/* returns a new z object equal to op (limbs are copied) */
+value ml_z_from_mpz(mpz_t op);
+
+#ifdef __cplusplus
+}
+#endif

From 22e2529b3f7da13f3136c2ac645ecf38f27bc2af Mon Sep 17 00:00:00 2001
From: Gabriel Scherer
Date: Wed, 30 Jun 2021 22:16:13 +0200
Subject: [PATCH 06/10] import the Zarith-inspired benchmark code

---
 Zarith/Makefile |  31 ++++++++
 Zarith/test.ml  | 195 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 226 insertions(+)
 create mode 100644 Zarith/Makefile
 create mode 100644 Zarith/test.ml

diff --git a/Zarith/Makefile b/Zarith/Makefile
new file mode 100644
index 000000000000..f9d39817f5f5
--- /dev/null
+++ b/Zarith/Makefile
@@ -0,0 +1,31 @@
+ROOTDIR=..
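+# Editorial note: typical invocations (assuming an in-tree compiler build):
+#   make run                 # time every implementation at BENCH_SIZE=20
+#   make run BENCH_SIZE=22   # sizes where additions overflow into GMP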
+OCAMLRUN=$(ROOTDIR)/runtime/ocamlrun
+STDFLAGS=-nostdlib -I $(ROOTDIR)/stdlib
+OCAMLC=$(OCAMLRUN) $(ROOTDIR)/ocamlc $(STDFLAGS)
+OCAMLOPT=$(OCAMLRUN) $(ROOTDIR)/ocamlopt $(STDFLAGS)
+
+BENCHS=wrong-int zarith unboxed boxed
+BENCH_SIZE ?= 20
+run: test.byte test.native
+	for bench in $(BENCHS); do\
+	  echo -e "\n\ntest.byte $$bench:";\
+	  time ./test.byte $$bench $(BENCH_SIZE) 1_000_000;\
+	done;\
+	for bench in $(BENCHS); do\
+	  echo -e "\n\ntest.native $$bench:";\
+	  time ./test.native $$bench $(BENCH_SIZE) 10_000_000;\
+	done
+
+ZARITH_FLAGS=-ccopt "-I$(ROOTDIR)/runtime -DHAS_GMP -O3 -Wall -Wextra" -cclib -lgmp
+
+test.cmm: test.ml
+	$(OCAMLOPT) -c test.ml -dcmm 2>test.cmm
+
+test.byte: test.ml
+	$(OCAMLC) -o test.byte -custom $(ZARITH_FLAGS) -linkall caml_z.c test.ml
+
+test.native: test.ml
+	$(OCAMLOPT) -o test.native $(ZARITH_FLAGS) caml_z.c test.ml
+
+clean:
+	rm -f test.cm* test.o caml_z.o test.byte test.native
diff --git a/Zarith/test.ml b/Zarith/test.ml
new file mode 100644
index 000000000000..9d20197a3efd
--- /dev/null
+++ b/Zarith/test.ml
@@ -0,0 +1,195 @@
+(* See 'Makefile' for building and running instructions *)
+type test =
+  | Wrong_int
+  | Zarith
+  | Boxed
+  | Unboxed
+
+
+let usage () =
+  Printf.eprintf
+    "Usage: %s [wrong-int|zarith|boxed|unboxed] \n%!"
+    Sys.argv.(0)
+
+let impl_choice = match Sys.argv.(1) with
+  | "wrong-int" -> Wrong_int
+  | "zarith" -> Zarith
+  | "boxed" -> Boxed
+  | "unboxed" -> Unboxed
+  | _ | exception _ -> usage (); exit 2
+
+let factorial_input =
+  try int_of_string Sys.argv.(2)
+  with _ -> usage (); exit 3
+
+let n_iters =
+  try int_of_string Sys.argv.(3)
+  with _ -> usage (); exit 4
+
+(* Our benchmark computes (factorial n) in a silly way,
+   using the repeated-addition definition of multiplication:
+
+   !3 is computed as
+     let !0 = 1 in
+     let !1 = !0 in
+     let !2 = !1 + !1 in
+     let !3 = !2 + !2 + !2 in
+     !3
+
+   The number of additions is quadratic in the input n.
+
+   The result of (factorial n) exceeds OCaml's 63-bit integers for n=21.
+*)
+let factorial of_int ( +! ) n =
+  let rec ( *! ) a b =
+    if b = 0 then of_int 0
+    else a *! (b - 1) +! a
+  in
+  let rec fac n =
+    if n = 0 then of_int 1
+    else fac (n - 1) *! n
+  in
+  fac n
+
+module Wrong_int = struct
+  (* This is a *wrong* implementation of bignums using integers that will overflow and wrap around.
+     It is informative to compare the benchmark performance of this implementation
+     with the correct implementations. *)
+  let of_int n = n
+  let add x y = x + y
+  let fac = factorial of_int add
+  let to_string = string_of_int
+end
+
+module Zarith = struct
+  (* This is taken verbatim from Zarith's z.ml:
+     https://github.com/ocaml/Zarith/blob/39df015/z.ml#L41-L49
+   *)
+  type t
+
+  external is_small_int: t -> bool = "%obj_is_int"
+  external unsafe_to_int: t -> int = "%identity"
+  external of_int: int -> t = "%identity"
+
+  external c_add: t -> t -> t = "ml_z_add"
+  let add_zarith x y =
+    if is_small_int x && is_small_int y then begin
+      let z = unsafe_to_int x + unsafe_to_int y in
+      (* Overflow check -- Hacker's Delight, section 2.12 *)
+      if (z lxor unsafe_to_int x) land (z lxor unsafe_to_int y) >= 0
+      then of_int z
+      else c_add x y
+    end else
+      c_add x y
+
+  (* new code *)
+  let fac n = factorial of_int add_zarith n
+
+  external c_format: string -> t -> string = "ml_z_format"
+  let to_string n = c_format "%d" n
+end
+
+module Boxed = struct
+  (* This is a naive, safe implementation of Zarith's fast-path logic,
+     using an algebraic datatype to distinguish short from long integers.
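+     (Editorial note: unlike Zarith, where a small integer is an immediate
+     tagged int, `Short n` here is itself a heap-allocated one-field block,
+     so even the fast path allocates; the Unboxed variant below removes
+     that allocation with [@unboxed].)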
+  *)
+  type t
+
+  external long_int: int -> t = "%identity"
+  external c_add: t -> t -> t = "ml_z_add"
+
+  type boxed =
+    | Short of int
+    | Long of t
+
+  let of_int n = Short n
+
+  let to_long = function
+    | Short x -> long_int x
+    | Long x -> x
+
+  let add_boxed a b =
+    match a, b with
+    | Short x, Short y ->
+      let z = x + y in
+      (* Overflow check -- Hacker's Delight, section 2.12 *)
+      if (z lxor x) land (z lxor y) >= 0
+      then Short z
+      else
+        (* Note: in the general case of additions, in some cases the
+           result of c_add may be a "short" integer again, and
+           should be turned back into a Short if we wanted to mimic
+           Zarith's representation invariants. The current approach
+           is correct, simpler, and strictly equivalent for this
+           benchmark as we only manipulate non-negative numbers. *)
+        Long (c_add (long_int x) (long_int y))
+    | _, _ -> Long (c_add (to_long a) (to_long b))
+
+  let fac n = factorial of_int add_boxed n
+
+  external c_format: string -> t -> string = "ml_z_format"
+  let to_string = function
+    | Short n -> string_of_int n
+    | Long n -> c_format "%d" n
+end
+
+module Unboxed = struct
+  (* This is a safe implementation of Zarith's fast-path logic,
+     using an algebraic datatype to distinguish short from long integers,
+     unboxing the "short" constructor to get something close to Zarith's internal representation.
+
+     The only difference compared to the Boxed implementation above is the [@unboxed] annotation.
+
+     Note: in theory we could also design the implementation to unbox the Long constructor
+     (provided it only stores custom blocks, never immediate integers), but we have not
+     implemented support for refining abstract type head-shapes, which would be required.
+  *)
+  type t
+
+  external long_int: int -> t = "%identity"
+  external c_add: t -> t -> t = "ml_z_add"
+
+  type boxed =
+    | Short of int [@unboxed]
+    | Long of t
+
+  let of_int n = Short n
+
+  let to_long = function
+    | Short x -> long_int x
+    | Long x -> x
+
+  let add_unboxed a b =
+    match a, b with
+    | Short x, Short y ->
+      let z = x + y in
+      (* Overflow check -- Hacker's Delight, section 2.12 *)
+      if (z lxor x) land (z lxor y) >= 0
+      then Short z
+      else Long (c_add (long_int x) (long_int y))
+    | _, _ -> Long (c_add (to_long a) (to_long b))
+
+  let fac n = factorial of_int add_unboxed n
+
+  external c_format: string -> t -> string = "ml_z_format"
+  let to_string = function
+    | Short n -> string_of_int n
+    | Long n -> c_format "%d" n
+end
+
+
+let test to_string fac =
+  for _i = 1 to n_iters - 1 do
+    ignore (fac factorial_input);
+  done;
+  print_endline (to_string (fac factorial_input))
+
+let () =
+  match impl_choice with
+  | Wrong_int ->
+    test Wrong_int.to_string Wrong_int.fac
+  | Zarith ->
+    test Zarith.to_string Zarith.fac
+  | Boxed ->
+    test Boxed.to_string Boxed.fac
+  | Unboxed ->
+    test Unboxed.to_string Unboxed.fac

From 92a7825c2b424800ba5a3ac7bed7fe8d75aa6093 Mon Sep 17 00:00:00 2001
From: Gabriel Scherer
Date: Sun, 10 Oct 2021 15:39:08 +0200
Subject: [PATCH 07/10] adapt the code to follow the paper presentation

---
 Zarith/Makefile |  4 ++++
 Zarith/caml_z.c | 22 ++++++++++++++++++++++
 Zarith/test.ml  | 33 ++++++++++++++-------------------
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/Zarith/Makefile b/Zarith/Makefile
index f9d39817f5f5..3627514e4f7a 100644
--- a/Zarith/Makefile
+++ b/Zarith/Makefile
@@ -6,6 +6,7 @@ OCAMLOPT=$(OCAMLRUN) $(ROOTDIR)/ocamlopt $(STDFLAGS)
 
 BENCHS=wrong-int zarith unboxed boxed
 BENCH_SIZE ?= 20
+
 run: test.byte test.native
 	for bench in $(BENCHS); do\
 	  echo -e "\n\ntest.byte
$$bench:";\ @@ -16,6 +17,9 @@ run: test.byte test.native time ./test.native $$bench $(BENCH_SIZE) 10_000_000;\ done +run-with-overflow: + $(MAKE) run BENCH_SIZE=22 + ZARITH_FLAGS=-ccopt "-I$(ROOTDIR)/runtime -DHAS_GMP -O3 -Wall -Wextra" -cclib -lgmp test.cmm: test.ml diff --git a/Zarith/caml_z.c b/Zarith/caml_z.c index 63283e647b09..c7c44d3d9e27 100644 --- a/Zarith/caml_z.c +++ b/Zarith/caml_z.c @@ -1423,6 +1423,28 @@ CAMLprim value ml_z_add(value arg1, value arg2) return ml_z_addsub(arg1, arg2, 0); } +static inline value unbox(value arg) { + if (Is_long(arg)) return arg; + else return Field(arg, 0); +} + +CAMLprim value box(value arg) { + if (Is_long(arg)) + return arg; + else { + CAMLparam1(arg); + value ret; + ret = caml_alloc_small(1, 0); + Field(ret, 0) = arg; + CAMLreturn(ret); + } +} + +CAMLprim value ml_z_add_boxcustom(value arg1, value arg2) +{ + return box(ml_z_addsub(unbox(arg1), unbox(arg2), 0)); +} + CAMLprim value ml_z_sub(value arg1, value arg2) { Z_MARK_OP; diff --git a/Zarith/test.ml b/Zarith/test.ml index 9d20197a3efd..dfac75a58306 100644 --- a/Zarith/test.ml +++ b/Zarith/test.ml @@ -92,14 +92,14 @@ end module Boxed = struct (* This is a naive, safe implementation of Zarith's fast-path logic, using an algebraic datatype to distinguish short from long integers. *) - type t + type gmp_t - external long_int: int -> t = "%identity" - external c_add: t -> t -> t = "ml_z_add" + external c_add: gmp_t -> gmp_t -> gmp_t = "ml_z_add" + external long_int : int -> gmp_t = "ml_z_of_int" - type boxed = + type t = | Short of int - | Long of t + | Long of gmp_t let of_int n = Short n @@ -126,7 +126,7 @@ module Boxed = struct let fac n = factorial of_int add_boxed n - external c_format: string -> t -> string = "ml_z_format" + external c_format: string -> gmp_t -> string = "ml_z_format" let to_string = function | Short n -> string_of_int n | Long n -> c_format "%d" n @@ -143,20 +143,15 @@ module Unboxed = struct (provided it only stores custom blocks, never immediate integers), but we have not implemented support for refining abstract type head-shapes, which would be required. 
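     (Editorial note: with [@unboxed], `Short n` is represented at run time
     as the immediate integer n itself, with no allocation, while `Long t`
     remains a one-field block; the ml_z_add_boxcustom stub added in this
     patch unwraps that block with Field(arg, 0) before calling ml_z_addsub.)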
*) - type t + type gmp_t - external long_int: int -> t = "%identity" - external c_add: t -> t -> t = "ml_z_add" - - type boxed = + type t = | Short of int [@unboxed] - | Long of t + | Long of gmp_t - let of_int n = Short n + external c_add: t -> t -> t = "ml_z_add_boxcustom" - let to_long = function - | Short x -> long_int x - | Long x -> x + let of_int n = Short n let add_unboxed a b = match a, b with @@ -165,12 +160,12 @@ module Unboxed = struct (* Overflow check -- Hacker's Delight, section 2.12 *) if (z lxor x) land (z lxor y) >= 0 then Short z - else Long (c_add (long_int x) (long_int y)) - | _, _ -> Long (c_add (to_long a) (to_long b)) + else c_add a b + | _, _ -> c_add a b let fac n = factorial of_int add_unboxed n - external c_format: string -> t -> string = "ml_z_format" + external c_format: string -> gmp_t -> string = "ml_z_format" let to_string = function | Short n -> string_of_int n | Long n -> c_format "%d" n From 63d7e10f9a8e98554b9dda5297eb8c0a1ed9f935 Mon Sep 17 00:00:00 2001 From: Gabriel Scherer Date: Sun, 10 Oct 2021 15:39:36 +0200 Subject: [PATCH 08/10] overflow counting logic --- Zarith/test.ml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Zarith/test.ml b/Zarith/test.ml index dfac75a58306..89a82f0ec7d8 100644 --- a/Zarith/test.ml +++ b/Zarith/test.ml @@ -153,15 +153,24 @@ module Unboxed = struct let of_int n = Short n + let with_overflow = ref 0 + let total_ops = ref 0 + + let () = + at_exit @@ fun () -> + Printf.printf "Overflow ratio: %f%%.\n%!" + (100. *. float !with_overflow /. float !total_ops) + let add_unboxed a b = + incr total_ops; match a, b with | Short x, Short y -> let z = x + y in (* Overflow check -- Hacker's Delight, section 2.12 *) if (z lxor x) land (z lxor y) >= 0 then Short z - else c_add a b - | _, _ -> c_add a b + else (incr with_overflow; c_add a b) + | _, _ -> (incr with_overflow; c_add a b) let fac n = factorial of_int add_unboxed n From a5944741674a6f786dcaac97cc1af7fa5cc18bf2 Mon Sep 17 00:00:00 2001 From: Gabriel Scherer Date: Sun, 10 Oct 2021 15:39:38 +0200 Subject: [PATCH 09/10] Revert "overflow counting logic" This reverts commit 3a09ced4e0391f745755972d947faa6080af6370. The overflow-counting logic may change the performance profile, so it is simpler to not have it at all in normal benchmarking mode. The figure we computed is that with BENCH_SIZE=22, 16% of operations overflow. --- Zarith/test.ml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/Zarith/test.ml b/Zarith/test.ml index 89a82f0ec7d8..dfac75a58306 100644 --- a/Zarith/test.ml +++ b/Zarith/test.ml @@ -153,24 +153,15 @@ module Unboxed = struct let of_int n = Short n - let with_overflow = ref 0 - let total_ops = ref 0 - - let () = - at_exit @@ fun () -> - Printf.printf "Overflow ratio: %f%%.\n%!" - (100. *. float !with_overflow /. 
float !total_ops)
-
-  let add_unboxed a b =
-    incr total_ops;
+  let add_unboxed a b =
     match a, b with
     | Short x, Short y ->
       let z = x + y in
       (* Overflow check -- Hacker's Delight, section 2.12 *)
       if (z lxor x) land (z lxor y) >= 0
       then Short z
-      else (incr with_overflow; c_add a b)
-    | _, _ -> (incr with_overflow; c_add a b)
+      else c_add a b
+    | _, _ -> c_add a b
 
   let fac n = factorial of_int add_unboxed n

From bda7606f05d42d970e1efcdcf5ce8ca192d728f0 Mon Sep 17 00:00:00 2001
From: Gabriel Scherer
Date: Mon, 13 Jun 2022 22:08:33 +0200
Subject: [PATCH 10/10] Zarith benchmark: include an unboxed_both version that
 also unboxes Custom

---
 Zarith/Makefile |  6 ++---
 Zarith/caml_z.c |  5 +++++
 Zarith/test.ml  | 59 ++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/Zarith/Makefile b/Zarith/Makefile
index 3627514e4f7a..0c841c5b4dce 100644
--- a/Zarith/Makefile
+++ b/Zarith/Makefile
@@ -4,7 +4,7 @@ STDFLAGS=-nostdlib -I $(ROOTDIR)/stdlib
 OCAMLC=$(OCAMLRUN) $(ROOTDIR)/ocamlc $(STDFLAGS)
 OCAMLOPT=$(OCAMLRUN) $(ROOTDIR)/ocamlopt $(STDFLAGS)
 
-BENCHS=wrong-int zarith unboxed boxed
+BENCHS=wrong-int zarith unboxed unboxed_both boxed
 BENCH_SIZE ?= 20
 
 run: test.byte test.native
@@ -25,10 +25,10 @@ ZARITH_FLAGS=-ccopt "-I$(ROOTDIR)/runtime -DHAS_GMP -O3 -Wall -Wextra" -cclib -l
 test.cmm: test.ml
 	$(OCAMLOPT) -c test.ml -dcmm 2>test.cmm
 
-test.byte: test.ml
+test.byte: test.ml caml_z.c
 	$(OCAMLC) -o test.byte -custom $(ZARITH_FLAGS) -linkall caml_z.c test.ml
 
-test.native: test.ml
+test.native: test.ml caml_z.c
 	$(OCAMLOPT) -o test.native $(ZARITH_FLAGS) caml_z.c test.ml
 
 clean:
diff --git a/Zarith/caml_z.c b/Zarith/caml_z.c
index c7c44d3d9e27..797e4599c533 100644
--- a/Zarith/caml_z.c
+++ b/Zarith/caml_z.c
@@ -1445,6 +1445,11 @@ CAMLprim value ml_z_add_boxcustom(value arg1, value arg2)
   return box(ml_z_addsub(unbox(arg1), unbox(arg2), 0));
 }
 
+CAMLprim value ml_z_add_unboxcustom(value arg1, value arg2)
+{
+  return ml_z_addsub(arg1, arg2, 0);
+}
+
 CAMLprim value ml_z_sub(value arg1, value arg2)
 {
   Z_MARK_OP;
diff --git a/Zarith/test.ml b/Zarith/test.ml
index dfac75a58306..b6b1364912fa 100644
--- a/Zarith/test.ml
+++ b/Zarith/test.ml
@@ -4,19 +4,25 @@ type test =
   | Wrong_int
   | Zarith
   | Boxed
   | Unboxed
+  | Unboxed_both
 
+let impls = [
+  "wrong-int", Wrong_int;
+  "zarith", Zarith;
+  "boxed", Boxed;
+  "unboxed", Unboxed;
+  "unboxed_both", Unboxed_both;
+]
 
 let usage () =
   Printf.eprintf
-    "Usage: %s [wrong-int|zarith|boxed|unboxed] \n%!"
+    "Usage: %s [%s] \n%!"
     Sys.argv.(0)
+    (String.concat "|" (List.map fst impls))
 
-let impl_choice = match Sys.argv.(1) with
-  | "wrong-int" -> Wrong_int
-  | "zarith" -> Zarith
-  | "boxed" -> Boxed
-  | "unboxed" -> Unboxed
-  | _ | exception _ -> usage (); exit 2
+let impl_choice = match List.assoc Sys.argv.(1) impls with
+  | v -> v
+  | exception _ -> usage (); exit 2
 
 let factorial_input =
   try int_of_string Sys.argv.(2)
@@ -138,10 +144,6 @@ module Unboxed = struct
      unboxing the "short" constructor to get something close to Zarith's internal representation.
 
     The only difference compared to the Boxed implementation above is the [@unboxed] annotation.
-
-     Note: in theory we could also design the implementation to unbox the Long constructor
-     (provided it only stores custom blocks, never immediate integers), but we have not
-     implemented support for refining abstract type head-shapes, which would be required.
  *)
   type gmp_t
 
   type t =
     | Short of int [@unboxed]
     | Long of gmp_t
 
   external c_add: t -> t -> t = "ml_z_add_boxcustom"
 
   let of_int n = Short n
@@ -171,6 +173,39 @@ module Unboxed = struct
     | Long n -> c_format "%d" n
 end
 
+module Unboxed_both = struct
+  (* This is a safe implementation of Zarith's fast-path logic,
+     using an algebraic datatype to distinguish short from long integers,
+     unboxing both constructors to recover Zarith's internal representation.
+     Declaring `type gmp_t [@@shape [custom]]` below tells the compiler that
+     gmp_t values are always custom blocks, never immediate integers, which
+     is what makes it safe to unbox the Long constructor as well.
+  *)
+  type gmp_t [@@shape [custom]]
+
+  type t =
+    | Short of int [@unboxed]
+    | Long of gmp_t [@unboxed]
+
+  external c_add: t -> t -> t = "ml_z_add_unboxcustom"
+
+  let of_int n = Short n
+
+  let add_unboxed a b =
+    match a, b with
+    | Short x, Short y ->
+      let z = x + y in
+      (* Overflow check -- Hacker's Delight, section 2.12 *)
+      if (z lxor x) land (z lxor y) >= 0
+      then Short z
+      else c_add a b
+    | _, _ -> c_add a b
+
+  let fac n = factorial of_int add_unboxed n
+
+  external c_format: string -> gmp_t -> string = "ml_z_format"
+  let to_string = function
+    | Short n -> string_of_int n
+    | Long n -> c_format "%d" n
+end
+
 
 let test to_string fac =
   for _i = 1 to n_iters - 1 do
@@ -188,3 +223,5 @@ let () =
     test Boxed.to_string Boxed.fac
   | Unboxed ->
     test Unboxed.to_string Unboxed.fac
+  | Unboxed_both ->
+    test Unboxed_both.to_string Unboxed_both.fac