diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index 9003a130..fe026807 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest]
+        os: [macos-latest]
 
     steps:
       - uses: actions/checkout@v5
diff --git a/include/PSLP/PSLP_API.h b/include/PSLP/PSLP_API.h
index 2f5eb30c..7afeead6 100644
--- a/include/PSLP/PSLP_API.h
+++ b/include/PSLP/PSLP_API.h
@@ -102,7 +102,7 @@ extern "C"
     } Presolver;
 
     /* The user is responsible for freeing the settings using 'free_settings'. */
-    Settings *default_settings();
+    Settings *default_settings(void);
     void free_settings(Settings *stgs);
     void set_settings_true(Settings *stgs);
     void set_settings_false(Settings *stgs);
diff --git a/include/PSLP/PSLP_stats.h b/include/PSLP/PSLP_stats.h
index 50fdf12d..aad2da00 100644
--- a/include/PSLP/PSLP_stats.h
+++ b/include/PSLP/PSLP_stats.h
@@ -47,6 +47,8 @@ extern "C"
         double time_init;
         double time_fast_reductions;
         double time_medium_reductions;
+        double time_ston_cols;
+        double time_dton_rows;
         double time_primal_propagation;
         double time_parallel_rows;
         double time_parallel_cols;
diff --git a/include/core/Activity.h b/include/core/Activity.h
index 598af0eb..64728ead 100644
--- a/include/core/Activity.h
+++ b/include/core/Activity.h
@@ -132,8 +132,7 @@ double compute_max_act_one_tag(const double *vals, const int *cols, int len,
 void update_activities_bound_change(Activity *activities, const struct Matrix *A,
                                     const struct Bound *bounds, const double *vals,
                                     const int *rows, int len, double old_bound,
-                                    double new_bound, double finite_bound,
-                                    bool lower,
+                                    double new_bound, bool finite_bound, bool lower,
                                     iVec *updated_activities HUGE_BOUND_PARAM);
 
 void update_activities_fixed_col(Activity *activities, const struct Matrix *A,
diff --git a/include/core/Constraints.h b/include/core/Constraints.h
index 996d74a0..2c523447 100644
--- a/include/core/Constraints.h
+++ b/include/core/Constraints.h
@@ -90,4 +90,8 @@ void delete_inactive_rows(Constraints *constraints);
 */
 void delete_inactive_cols_from_A_and_AT(Constraints *constraints);
 
+#ifdef TESTING
+size_t update_column_map(const int *col_sizes, int *map, size_t n_cols);
+#endif
+
 #endif // CONSTRAINTS_H
diff --git a/include/core/Debugger.h b/include/core/Debugger.h
index a6597f13..a2bcd3ce 100644
--- a/include/core/Debugger.h
+++ b/include/core/Debugger.h
@@ -28,7 +28,6 @@ struct Lock;
 struct iVec;
 struct PresolveStats;
 
-#include <assert.h>
 #include <stdbool.h>
 #include <stddef.h>
 
diff --git a/include/core/Matrix.h b/include/core/Matrix.h
index 01c0bfb1..313a923c 100644
--- a/include/core/Matrix.h
+++ b/include/core/Matrix.h
@@ -108,8 +108,8 @@ empty column of A, will be removed when this function is called on AT.
 To have consistency between A and AT we must update the column indices of A.
 This is done in the end of the function (columns[j] = colsmap[columns[j]]).
 */
-void remove_extra_space(Matrix *A, const int *row_sizes, const int *col_sizes,
-                        bool remove_all, int *col_idxs_map);
+void remove_extra_space(Matrix *A, const int *row_sizes, bool remove_all,
+                        const int *col_idxs_map, size_t new_n_cols);
 
 void print_row_starts(const RowRange *row_ranges, size_t len);
 
@@ -120,6 +120,7 @@ Matrix *random_matrix_new(size_t n_rows, size_t n_cols, double density);
 // rows; otherwise it throws an assertion
 void replace_row_A(Matrix *A, int row, double ratio, double *new_vals, int *cols_new,
                    int new_len);
+
 #endif
 
 #endif // SPARSE_LA_MATRIX_H
diff --git a/include/core/Workspace.h b/include/core/Workspace.h
index 11d62ed5..6ac73bdc 100644
--- a/include/core/Workspace.h
+++ b/include/core/Workspace.h
@@ -46,6 +46,10 @@ typedef struct Work
     // int_vec is used within parallel_rows and dual propagation
     iVec *int_vec;
 
+    // radix_aux is used as auxiliary buffer for radix sort in parallel_rows/cols
+    // TODO: do we really need this? or can we reuse space?
+    int *radix_aux;
+
     Mapping *mappings;
 } Work;
 
diff --git a/include/core/debug_macros.h b/include/core/debug_macros.h
index 4a773472..6e5fb09c 100644
--- a/include/core/debug_macros.h
+++ b/include/core/debug_macros.h
@@ -22,6 +22,21 @@
 #include "Tags.h"
 #include <stdbool.h>
 #include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Assertion that fires even in release mode (NDEBUG). Only use inside
+   the debugger (Debugger.c / Debugger.h) — never in production code. */
+#define PSLP_ASSERT(cond)                                                           \
+    do                                                                              \
+    {                                                                               \
+        if (!(cond))                                                                \
+        {                                                                           \
+            fprintf(stderr, "PSLP assertion failed: %s [%s:%d]\n", #cond, __FILE__, \
+                    __LINE__);                                                      \
+            abort();                                                                \
+        }                                                                           \
+    } while (0)
 
 #ifdef NDEBUG
 #define HUGE_BOUND_IS_OK
@@ -89,7 +104,7 @@ static inline bool ARRAYS_EQUAL_COLTAG(ColTag *arr1, const ColTag *arr2, int siz
     {                                                                               \
         for (int iii = 0; iii < (len); iii++)                                       \
         {                                                                           \
-            assert(!HAS_TAG(row_tags[rows[iii]], R_TAG_INACTIVE));                  \
+            PSLP_ASSERT(!HAS_TAG(row_tags[rows[iii]], R_TAG_INACTIVE));             \
         }                                                                           \
     } while (0)
 
diff --git a/include/core/radix_sort.h b/include/core/radix_sort.h
new file mode 100644
index 00000000..f4d154ca
--- /dev/null
+++ b/include/core/radix_sort.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2025 Daniel Cederberg
+ *
+ * This file is part of the PSLP project (LP Presolver).
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RADIX_SORT_H
+#define RADIX_SORT_H
+
+#include <stddef.h>
+
+// LSD radix sort on composite key (sparsity_ID, coeff_hash).
+// Sorts row indices in-place. aux must have space for n ints.
+void radix_sort_rows(int *rows, size_t n, const int *sparsity_IDs,
+                     const int *coeff_hashes, int *aux);
+
+// Parallel version: splits into 4 chunks, sorts in parallel, merges.
+// Falls back to sequential radix_sort_rows for n < 100000.
+void parallel_radix_sort_rows(int *rows, size_t n, const int *sparsity_IDs,
+                              const int *coeff_hashes, int *aux);
+
+#endif /* RADIX_SORT_H */
diff --git a/include/data_structures/Binary_search.h b/include/data_structures/Binary_search.h
new file mode 100644
index 00000000..964fd02d
--- /dev/null
+++ b/include/data_structures/Binary_search.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2025 Daniel Cederberg
+ *
+ * This file is part of the PSLP project (LP Presolver).
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CORE_BINARY_SEARCH_H
+#define CORE_BINARY_SEARCH_H
+
+#define BSEARCH_LINEAR_THRESHOLD 8
+
+/* Find the index of 'target' in a sorted array 'arr' of length 'len'.
+   Returns the relative index (0-based) if found, or -1 if not found.
+   Falls back to linear scan for short arrays. */
+static inline int sorted_find(const int *arr, int len, int target)
+{
+    if (len <= BSEARCH_LINEAR_THRESHOLD)
+    {
+        for (int i = 0; i < len; ++i)
+        {
+            if (arr[i] == target) return i;
+            if (arr[i] > target) return -1;
+        }
+        return -1;
+    }
+
+    int lo = 0, hi = len - 1;
+    while (lo <= hi)
+    {
+        int mid = lo + (hi - lo) / 2;
+        if (arr[mid] == target)
+        {
+            return mid;
+        }
+        if (arr[mid] < target)
+        {
+            lo = mid + 1;
+        }
+        else
+        {
+            hi = mid - 1;
+        }
+    }
+    return -1;
+}
+
+/* Find the first index where arr[i] >= target in a sorted array of length 'len'.
+   Returns 'len' if all elements are less than target.
+   Falls back to linear scan for short arrays. */
+static inline int sorted_lower_bound(const int *arr, int len, int target)
+{
+    if (len <= BSEARCH_LINEAR_THRESHOLD)
+    {
+        for (int i = 0; i < len; ++i)
+        {
+            if (arr[i] >= target)
+            {
+                return i;
+            }
+        }
+        return len;
+    }
+
+    int lo = 0, hi = len;
+    while (lo < hi)
+    {
+        int mid = lo + (hi - lo) / 2;
+        if (arr[mid] < target)
+        {
+            lo = mid + 1;
+        }
+        else
+        {
+            hi = mid;
+        }
+    }
+    return lo;
+}
+
+#endif // CORE_BINARY_SEARCH_H
diff --git a/include/explorers/Parallel_cols.h b/include/explorers/Parallel_cols.h
index a12a5cc9..220c1503 100644
--- a/include/explorers/Parallel_cols.h
+++ b/include/explorers/Parallel_cols.h
@@ -25,7 +25,7 @@
 struct Problem;
 
 /* This function finds parallel columns and merges them. It always
-   returns UNCHANGED. When this function finishes, the problem is
+   returns UNCHANGED or UNBNDORINFEAS. When this function finishes, the problem is
    up to date, and no extra synchronization is needed.
 */
 PresolveStatus remove_parallel_cols(struct Problem *prob);
diff --git a/include/explorers/Parallel_rows.h b/include/explorers/Parallel_rows.h
index 3350c5d7..f31ea891 100644
--- a/include/explorers/Parallel_rows.h
+++ b/include/explorers/Parallel_rows.h
@@ -55,7 +55,8 @@ void compute_supp_and_coeff_hash(const struct Matrix *A, const RowTag *rowtags,
 */
 void find_parallel_rows(const struct Matrix *A, const RowTag *r_Tags,
                         struct iVec *group_starts, int *parallel_rows,
-                        int *sparsity_IDs, int *coeff_hashes, RowTag INACTIVE_TAG);
+                        int *sparsity_IDs, int *coeff_hashes, RowTag INACTIVE_TAG,
+                        int *radix_aux);
 
 /* This function finds parallel rows and deletes the redundant ones, or declares
    the problem as infeasible if the parallel rows are not consistent. It returns
diff --git a/src/core/Activity.c b/src/core/Activity.c
index 49c7a8f9..2d6fd2b2 100644
--- a/src/core/Activity.c
+++ b/src/core/Activity.c
@@ -484,8 +484,7 @@ Altered_Activity update_act_bound_change(Activity *act, double coeff,
 void update_activities_bound_change(Activity *activities, const Matrix *A,
                                     const Bound *bounds, const double *vals,
                                     const int *rows, int len, double old_bound,
-                                    double new_bound, double finite_bound,
-                                    bool lower,
+                                    double new_bound, bool finite_bound, bool lower,
                                     iVec *updated_activities HUGE_BOUND_PARAM)
 {
     assert(!IS_HUGE(new_bound) || huge_bound_ok);
@@ -705,15 +704,22 @@ Altered_Activity update_activity_coeff_change(Activity *act, double lb, double u
 #endif
 
     // recompute from scratch if necessary
-    if (act->n_inf_min == 0 && n_inf_min_before == 1)
+    bool recompute_min = (act->n_inf_min == 0 && n_inf_min_before == 1);
+    bool recompute_max = (act->n_inf_max == 0 && n_inf_max_before == 1);
+
+    if (recompute_max && recompute_min)
     {
-        return MIN_ALTERED_RECOMPUTE;
+        assert(new_coeff == 0.0);
+        return MIN_ALTERED_RECOMPUTE | MAX_ALTERED_RECOMPUTE;
     }
-
-    if (act->n_inf_max == 0 && n_inf_max_before == 1)
+    else if (recompute_max)
     {
         return MAX_ALTERED_RECOMPUTE;
     }
+    else if (recompute_min)
+    {
+        return MIN_ALTERED_RECOMPUTE;
+    }
 
     return NO_RECOMPUTE;
 }
diff --git a/src/core/Constraints.c b/src/core/Constraints.c
index 1be6c19a..843700fb 100644
--- a/src/core/Constraints.c
+++ b/src/core/Constraints.c
@@ -302,18 +302,45 @@ static void bounds_shrink(Bound *ptr, int *map, size_t len)
     }
 }
 
+size_t update_column_map(const int *col_sizes, int *map, size_t n_cols)
+{
+    size_t col_count = 0;
+    for (size_t col = 0; col < n_cols; ++col)
+    {
+        if (col_sizes[col] == SIZE_INACTIVE_COL)
+        {
+            map[col] = -1;
+        }
+        else
+        {
+            map[col] = (int) col_count++;
+        }
+    }
+
+    return col_count;
+}
+
 void constraints_clean(Constraints *constraints, Mapping *map, bool remove_all)
 {
     const int *row_sizes = constraints->state->row_sizes;
     const int *col_sizes = constraints->state->col_sizes;
 
-    remove_extra_space(constraints->A, row_sizes, col_sizes, remove_all, map->cols);
-    remove_extra_space(constraints->AT, col_sizes, row_sizes, remove_all, map->rows);
+    size_t new_n_cols = update_column_map(col_sizes, map->cols, constraints->n);
+    remove_extra_space(constraints->A, row_sizes, remove_all, map->cols, new_n_cols);
 
-    dPtr_shrink(constraints->rhs, map->rows, constraints->m);
-    dPtr_shrink(constraints->lhs, map->rows, constraints->m);
+    size_t new_n_rows = update_column_map(row_sizes, map->rows, constraints->m);
+    (void) new_n_rows;
+
+    /* we don't need these for anything so no need to clean it */
+#if defined(TESTING) || !defined(NDEBUG)
+    remove_extra_space(constraints->AT, col_sizes, remove_all, map->rows,
+                       new_n_rows);
     rowTagPtr_shrink(constraints->row_tags, map->rows, constraints->m);
     colTagPtr_shrink(constraints->col_tags, map->cols, constraints->n);
+#endif
+
+    dPtr_shrink(constraints->rhs, map->rows, constraints->m);
+    dPtr_shrink(constraints->lhs, map->rows, constraints->m);
     bounds_shrink(constraints->bounds, map->cols, constraints->n);
     constraints->m = constraints->A->m;
     constraints->n = constraints->A->n;
diff --git a/src/core/Debugger.c b/src/core/Debugger.c
index a972ee4e..2c89f5f6 100644
--- a/src/core/Debugger.c
+++ b/src/core/Debugger.c
@@ -26,7 +26,6 @@
 #include "PSLP_stats.h"
 #include "State.h"
 #include "glbopts.h"
-#include <assert.h>
 #include <stdio.h>
 
 #ifdef __has_include
@@ -65,7 +64,7 @@ int ASSERT_NO_ZEROS_D(const double *x, size_t len)
 {
     for (size_t i = 0; i < len; ++i)
     {
-        assert(x[i] != 0);
+        PSLP_ASSERT(x[i] != 0);
     }
 
     return 0;
@@ -99,7 +98,7 @@ void verify_empty_rows(const Constraints *constraints)
     // verify that all appended rows are empty rows
     for (size_t i = 0; i < empty_rows->len; ++i)
     {
-        assert(row_sizes[empty_rows->data[i]] == 0);
+        PSLP_ASSERT(row_sizes[empty_rows->data[i]] == 0);
         lookup[empty_rows->data[i]] = 1;
     }
 
@@ -109,7 +108,7 @@ void verify_empty_rows(const Constraints *constraints)
     {
         if (row_sizes[i] == 0)
         {
-            assert(lookup[i]);
+            PSLP_ASSERT(lookup[i]);
         }
     }
 
@@ -126,7 +125,7 @@ void verify_empty_cols(const Constraints *constraints)
     // verify that all appended columns are empty columns
     for (size_t i = 0; i < empty_cols->len; ++i)
     {
-        assert(col_sizes[empty_cols->data[i]] == 0);
+        PSLP_ASSERT(col_sizes[empty_cols->data[i]] == 0);
         lookup[empty_cols->data[i]] = 1;
     }
 
@@ -136,7 +135,7 @@ void verify_empty_cols(const Constraints *constraints)
     {
         if (col_sizes[i] == 0)
         {
-            assert(lookup[i]);
+            PSLP_ASSERT(lookup[i]);
         }
     }
 
@@ -154,7 +153,7 @@ void verify_ston_cols(const Constraints *constraints)
     // empty/inactive
     for (size_t i = 0; i < ston_cols->len; ++i)
     {
-        assert(col_sizes[ston_cols->data[i]] <= 1);
+        PSLP_ASSERT(col_sizes[ston_cols->data[i]] <= 1);
         lookup[ston_cols->data[i]] = 1;
     }
 
@@ -164,7 +163,7 @@ void verify_ston_cols(const Constraints *constraints)
     {
         if (col_sizes[i] == 1)
         {
-            assert(lookup[i]);
+            PSLP_ASSERT(lookup[i]);
         }
     }
 
@@ -182,7 +181,7 @@ void verify_ston_rows(const Constraints *constraints)
     for (size_t i = 0; i < ston_rows->len; ++i)
     {
         // if (!colTag.compare(ColTag::kFixed))??
-        assert(row_sizes[ston_rows->data[i]] == 1);
+        PSLP_ASSERT(row_sizes[ston_rows->data[i]] == 1);
         lookup[ston_rows->data[i]] = 1;
     }
 
@@ -192,7 +191,7 @@ void verify_ston_rows(const Constraints *constraints)
     {
         if (row_sizes[i] == 1)
         {
-            assert(lookup[i]);
+            PSLP_ASSERT(lookup[i]);
         }
     }
 
@@ -212,7 +211,7 @@ void verify_doubleton_rows(const Constraints *constraints)
     for (size_t i = 0; i < doubleton_rows->len; ++i)
     {
         lookup[doubleton_rows->data[i]] = 1;
-        assert(row_sizes[doubleton_rows->data[i]] <= 2);
+        PSLP_ASSERT(row_sizes[doubleton_rows->data[i]] <= 2);
         bool condition = HAS_TAG(row_tags[doubleton_rows->data[i]],
                                  (R_TAG_EQ | R_TAG_INACTIVE)) ||
                          row_sizes[doubleton_rows->data[i]] < 2;
@@ -225,7 +224,7 @@ void verify_doubleton_rows(const Constraints *constraints)
                    row_sizes[doubleton_rows->data[i]]);
         }
 
-        assert(condition);
+        PSLP_ASSERT(condition);
     }
 
     // verify that all dton rows have been appended using a look-up
@@ -234,7 +233,7 @@ void verify_doubleton_rows(const Constraints *constraints)
     {
         if (row_sizes[i] == 2 && HAS_TAG(row_tags[i], R_TAG_EQ))
         {
-            assert(lookup[i]);
+            PSLP_ASSERT(lookup[i]);
         }
     }
 
@@ -253,7 +252,7 @@ void verify_no_duplicates(const iVec *vec)
     {
         for (size_t j = i + 1; j < vec->len; ++j)
         {
-            assert(vec->data[i] != vec->data[j]);
+            PSLP_ASSERT(vec->data[i] != vec->data[j]);
         }
     }
 }
@@ -271,8 +270,8 @@ void verify_no_duplicates_sort_ptr(const int *data, size_t len)
 
     for (size_t i = 0; i < len - 1; ++i)
     {
-        assert(temp[i] >= 0);
-        assert(temp[i] != temp[i + 1]);
+        PSLP_ASSERT(temp[i] >= 0);
+        PSLP_ASSERT(temp[i] != temp[i + 1]);
     }
 
     PS_FREE(temp);
@@ -289,7 +288,7 @@ void verify_no_duplicates_ptr(const int *data, size_t len)
     {
         for (size_t j = i + 1; j < len; ++j)
         {
-            assert(data[i] != data[j]);
+            PSLP_ASSERT(data[i] != data[j]);
         }
     }
 }
@@ -298,7 +297,7 @@ void verify_nonnegative_iVec(const iVec *vec)
 {
     for (size_t i = 0; i < vec->len; ++i)
     {
-        assert(vec->data[i] >= 0);
+        PSLP_ASSERT(vec->data[i] >= 0);
     }
 }
 
@@ -329,28 +328,28 @@ void verify_row_tags(const Constraints *constraints)
     {
         if (IS_NEG_INF(lhs[i]) && !HAS_TAG(row_tags[i], R_TAG_INACTIVE))
         {
-            assert(HAS_TAG(row_tags[i], R_TAG_LHS_INF));
+            PSLP_ASSERT(HAS_TAG(row_tags[i], R_TAG_LHS_INF));
         }
 
         if (IS_POS_INF(rhs[i]) && !HAS_TAG(row_tags[i], R_TAG_INACTIVE))
         {
-            assert(HAS_TAG(row_tags[i], R_TAG_RHS_INF));
+            PSLP_ASSERT(HAS_TAG(row_tags[i], R_TAG_RHS_INF));
         }
 
         if (HAS_TAG(row_tags[i], R_TAG_LHS_INF))
         {
-            assert(IS_NEG_INF(lhs[i]));
+            PSLP_ASSERT(IS_NEG_INF(lhs[i]));
         }
 
         if (HAS_TAG(row_tags[i], R_TAG_RHS_INF))
         {
-            assert(IS_POS_INF(rhs[i]));
+            PSLP_ASSERT(IS_POS_INF(rhs[i]));
         }
 
         if (!HAS_TAG(row_tags[i], R_TAG_LHS_INF) &&
             !HAS_TAG(row_tags[i], R_TAG_RHS_INF) && lhs[i] == rhs[i])
         {
-            assert(HAS_TAG(row_tags[i], (R_TAG_EQ | R_TAG_INACTIVE)));
+            PSLP_ASSERT(HAS_TAG(row_tags[i], (R_TAG_EQ | R_TAG_INACTIVE)));
         }
     }
 }
@@ -360,16 +359,17 @@ static void verify_CSR_matrix(const Matrix *A, bool compressed)
     size_t i, nnz;
     int j;
 
-    assert(A->n_alloc >= A->nnz);
+    PSLP_ASSERT(A->n_alloc >= A->nnz);
 
     if (A->nnz > 0)
     {
-        assert(A->i != NULL && A->x != NULL && A->p != NULL);
+        PSLP_ASSERT(A->i != NULL && A->x != NULL && A->p != NULL);
     }
     // verify that row ranges are ascending
     for (i = 0; i < A->m; ++i)
     {
-        assert(A->p[i + 1].start >= A->p[i].end && A->p[i].end >= A->p[i].start);
+        PSLP_ASSERT(A->p[i + 1].start >= A->p[i].end &&
+                    A->p[i].end >= A->p[i].start);
     }
 
     // verify that columns within a row are sorted
@@ -377,7 +377,7 @@ static void verify_CSR_matrix(const Matrix *A, bool compressed)
     {
         for (j = A->p[i].start + 1; j < A->p[i].end; ++j)
         {
-            assert(A->i[j] > A->i[j - 1]);
+            PSLP_ASSERT(A->i[j] > A->i[j - 1]);
         }
     }
 
@@ -387,19 +387,19 @@ static void verify_CSR_matrix(const Matrix *A, bool compressed)
     {
         for (j = A->p[i].start; j < A->p[i].end; ++j)
         {
-            assert(A->x[j] != 0);
+            PSLP_ASSERT(A->x[j] != 0);
         }
 
         nnz += (size_t) (A->p[i].end - A->p[i].start);
     }
 
-    assert(nnz == A->nnz);
+    PSLP_ASSERT(nnz == A->nnz);
 
     if (compressed)
     {
-        assert(A->p[A->m].start == A->nnz);
-        assert(A->p[A->m].end == A->nnz);
-        assert(A->p[0].start == 0);
+        PSLP_ASSERT(A->p[A->m].start == A->nnz);
+        PSLP_ASSERT(A->p[A->m].end == A->nnz);
+        PSLP_ASSERT(A->p[0].start == 0);
     }
 }
 
@@ -409,9 +409,9 @@ bool verify_A_and_AT_consistency(const Matrix *A, const Matrix *AT)
     Matrix *real_AT = transpose(A, work_n_cols);
 
     // check that nnz and dimensions are consistent
-    assert(real_AT->m == AT->m);
-    assert(real_AT->n == AT->n);
-    assert(real_AT->nnz == AT->nnz);
+    PSLP_ASSERT(real_AT->m == AT->m);
+    PSLP_ASSERT(real_AT->n == AT->n);
+    PSLP_ASSERT(real_AT->nnz == AT->nnz);
 
     size_t i;
     int j, k, start_AT, end_AT, start_real_AT, end_real_AT;
@@ -424,11 +424,11 @@ bool verify_A_and_AT_consistency(const Matrix *A, const Matrix *AT)
         start_real_AT = real_AT->p[i].start;
         end_real_AT = real_AT->p[i].end;
 
-        assert(end_AT - start_AT == end_real_AT - start_real_AT);
+        PSLP_ASSERT(end_AT - start_AT == end_real_AT - start_real_AT);
 
         for (j = start_AT, k = start_real_AT; j < end_AT; ++j, ++k)
         {
-            assert(!IS_ABS_INF(AT->x[j]) && !IS_ABS_INF(real_AT->x[k]));
+            PSLP_ASSERT(!IS_ABS_INF(AT->x[j]) && !IS_ABS_INF(real_AT->x[k]));
             if (AT->x[j] != real_AT->x[k] || AT->i[j] != real_AT->i[k])
             {
                 PS_FREE(work_n_cols);
@@ -454,7 +454,7 @@ static void verify_no_inactive_cols(const Matrix *A, ColTag *col_tags)
 
         for (j = start; j < end; ++j)
         {
-            assert(!HAS_TAG(col_tags[A->i[j]], C_TAG_INACTIVE));
+            PSLP_ASSERT(!HAS_TAG(col_tags[A->i[j]], C_TAG_INACTIVE));
         }
     }
 }
@@ -469,7 +469,7 @@ void verify_A_and_AT(const Constraints *constraints, bool compressed)
     verify_CSR_matrix(AT, compressed);
 
     // verify that A and AT are consistent
-    assert(verify_A_and_AT_consistency(A, AT));
+    PSLP_ASSERT(verify_A_and_AT_consistency(A, AT));
 
     // verify that A does not store the coefficients of inactive columns
     verify_no_inactive_cols(A, constraints->col_tags);
@@ -514,27 +514,27 @@ void verify_locks(const Matrix *AT, const Lock *locks, const ColTag *col_tags,
             }
         }
 
-        assert(up == locks[col].up);
-        assert(down == locks[col].down);
+        PSLP_ASSERT(up == locks[col].up);
+        PSLP_ASSERT(down == locks[col].down);
     }
 }
 
 void verify_empty_when_finished(const Constraints *constraints)
 {
-    assert(constraints->state->empty_cols->len == 0);
-    assert(constraints->state->empty_rows->len == 0);
-    assert(constraints->state->rows_to_delete->len == 0);
-    assert(constraints->state->fixed_cols_to_delete->len == 0);
-    assert(constraints->state->sub_cols_to_delete->len == 0);
+    PSLP_ASSERT(constraints->state->empty_cols->len == 0);
+    PSLP_ASSERT(constraints->state->empty_rows->len == 0);
+    PSLP_ASSERT(constraints->state->rows_to_delete->len == 0);
+    PSLP_ASSERT(constraints->state->fixed_cols_to_delete->len == 0);
+    PSLP_ASSERT(constraints->state->sub_cols_to_delete->len == 0);
 
     for (int i = 0; i < constraints->A->m; ++i)
     {
-        assert(!HAS_TAG(constraints->row_tags[i], R_TAG_INACTIVE));
+        PSLP_ASSERT(!HAS_TAG(constraints->row_tags[i], R_TAG_INACTIVE));
     }
 
     for (int i = 0; i < constraints->A->n; ++i)
     {
-        assert(!HAS_TAG(constraints->col_tags[i], C_TAG_INACTIVE));
+        PSLP_ASSERT(!HAS_TAG(constraints->col_tags[i], C_TAG_INACTIVE));
     }
 }
 
@@ -553,17 +553,17 @@ void verify_row_and_col_sizes(const Constraints *constraints)
     {
         if (HAS_TAG(row_tags[i], R_TAG_INACTIVE))
         {
-            assert(row_sizes[i] == SIZE_INACTIVE_ROW);
-            assert(A->p[i].start == A->p[i].end);
+            PSLP_ASSERT(row_sizes[i] == SIZE_INACTIVE_ROW);
+            PSLP_ASSERT(A->p[i].start == A->p[i].end);
         }
         else
         {
-            assert(row_sizes[i] == A->p[i].end - A->p[i].start);
+            PSLP_ASSERT(row_sizes[i] == A->p[i].end - A->p[i].start);
         }
 
         if (row_sizes[i] == SIZE_INACTIVE_ROW)
         {
-            assert(HAS_TAG(row_tags[i], R_TAG_INACTIVE));
+            PSLP_ASSERT(HAS_TAG(row_tags[i], R_TAG_INACTIVE));
         }
     }
 
@@ -577,22 +577,22 @@ void verify_row_and_col_sizes(const Constraints *constraints)
                 printf("col_sizes[%d] = %d\n", i, col_sizes[i]);
             }
 
-            assert(col_sizes[i] == SIZE_INACTIVE_COL);
+            PSLP_ASSERT(col_sizes[i] == SIZE_INACTIVE_COL);
             if (AT->p[i].start != AT->p[i].end)
             {
                 printf("AT->p[%d].start = %d\n", i, AT->p[i].start);
                 printf("AT->p[%d].end = %d\n", i, AT->p[i].end);
             }
-            assert(AT->p[i].start == AT->p[i].end);
+            PSLP_ASSERT(AT->p[i].start == AT->p[i].end);
         }
         else
         {
-            assert(col_sizes[i] == AT->p[i].end - AT->p[i].start);
+            PSLP_ASSERT(col_sizes[i] == AT->p[i].end - AT->p[i].start);
         }
 
         if (col_sizes[i] == SIZE_INACTIVE_COL)
         {
-            assert(HAS_TAG(col_tags[i], C_TAG_INACTIVE));
+            PSLP_ASSERT(HAS_TAG(col_tags[i], C_TAG_INACTIVE));
         }
     }
 }
@@ -646,8 +646,8 @@ void verify_activity(const ColTag *col_tags, const Bound *bounds, Activity activ
         }
     }
 
-    assert(n_inf_max == activity.n_inf_max);
-    assert(n_inf_min == activity.n_inf_min);
+    PSLP_ASSERT(n_inf_max == activity.n_inf_max);
+    PSLP_ASSERT(n_inf_min == activity.n_inf_min);
 
     // if there is a max infinite contribution, the max activity should be
     // INVALID_ACT_DEBUG
@@ -658,7 +658,7 @@ void verify_activity(const ColTag *col_tags, const Bound *bounds, Activity activ
             printf("n_inf_max = %d\n", n_inf_max);
             printf("activity.max = %f\n", activity.max);
         }
-        assert(activity.max == INVALID_ACT_DEBUG);
+        PSLP_ASSERT(activity.max == INVALID_ACT_DEBUG);
     }
     else
     {
@@ -673,24 +673,25 @@ void verify_activity(const ColTag *col_tags, const Bound *bounds, Activity activ
 
             if (vals[i] > 0)
             {
-                assert(!HAS_TAG(col_tags[col], C_TAG_UB_INF) &&
-                       !IS_POS_INF(bounds[col].ub));
+                PSLP_ASSERT(!HAS_TAG(col_tags[col], C_TAG_UB_INF) &&
+                            !IS_POS_INF(bounds[col].ub));
                 max += vals[i] * bounds[col].ub;
             }
             else
             {
-                assert(!HAS_TAG(col_tags[col], C_TAG_LB_INF) &&
-                       !IS_NEG_INF(bounds[col].lb));
+                PSLP_ASSERT(!HAS_TAG(col_tags[col], C_TAG_LB_INF) &&
+                            !IS_NEG_INF(bounds[col].lb));
                 max += vals[i] * bounds[col].lb;
             }
         }
+        PSLP_ASSERT(ABS(activity.max - max) < 1e-6);
     }
 
     // if there is a min infinite contribution, the min activity should be
     // INVALID_ACT_DEBUG
     if (n_inf_min > 0)
     {
-        assert(activity.min == INVALID_ACT_DEBUG);
+        PSLP_ASSERT(activity.min == INVALID_ACT_DEBUG);
     }
     else
     {
@@ -705,17 +706,19 @@ void verify_activity(const ColTag *col_tags, const Bound *bounds, Activity activ
 
             if (vals[i] > 0)
             {
-                assert(!HAS_TAG(col_tags[col], C_TAG_LB_INF) &&
-                       !IS_NEG_INF(bounds[col].lb));
+                PSLP_ASSERT(!HAS_TAG(col_tags[col], C_TAG_LB_INF) &&
+                            !IS_NEG_INF(bounds[col].lb));
                 min += vals[i] * bounds[col].lb;
             }
             else
             {
-                assert(!HAS_TAG(col_tags[col], C_TAG_UB_INF) &&
-                       !IS_POS_INF(bounds[col].ub));
+                PSLP_ASSERT(!HAS_TAG(col_tags[col], C_TAG_UB_INF) &&
+                            !IS_POS_INF(bounds[col].ub));
                 min += vals[i] * bounds[col].ub;
             }
         }
+
+        PSLP_ASSERT(ABS(activity.min - min) < 1e-6);
     }
 }
 
@@ -741,8 +744,8 @@ void ASSERT_NO_ACTIVE_STON_ROWS(const Matrix *A, const RowTag *row_tags)
 {
     for (int i = 0; i < A->m; ++i)
     {
-        assert(A->p[i].end - A->p[i].start != 1 ||
-               HAS_TAG(row_tags[i], R_TAG_INACTIVE));
+        PSLP_ASSERT(A->p[i].end - A->p[i].start != 1 ||
+                    HAS_TAG(row_tags[i], R_TAG_INACTIVE));
     }
 }
 
@@ -750,7 +753,7 @@ int ASSERT_INCREASING_I(const int *x, size_t len)
 {
     for (size_t i = 1; i < len; ++i)
     {
-        assert(x[i] > x[i - 1]);
+        PSLP_ASSERT(x[i] > x[i - 1]);
     }
 
     return 0;
@@ -780,9 +783,9 @@ void print_matrix(const Matrix *A)
 
 void verify_problem_up_to_date(const Constraints *constraints)
 {
-    assert(constraints->state->rows_to_delete->len == 0);
-    assert(constraints->state->fixed_cols_to_delete->len == 0);
-    assert(constraints->state->sub_cols_to_delete->len == 0);
+    PSLP_ASSERT(constraints->state->rows_to_delete->len == 0);
+    PSLP_ASSERT(constraints->state->fixed_cols_to_delete->len == 0);
+    PSLP_ASSERT(constraints->state->sub_cols_to_delete->len == 0);
 }
 
 void verify_row_states(const Activity *acts, const iVec *updated_activities)
@@ -790,8 +793,8 @@ void verify_row_states(const Activity *acts, const iVec *updated_activities)
     for (int i = 0; i < updated_activities->len; ++i)
     {
         int row = updated_activities->data[i];
-        assert(acts[row].status == ADDED);
-        assert(acts[row].n_inf_min == 0 || acts[row].n_inf_max == 0);
+        PSLP_ASSERT(acts[row].status == ADDED);
+        PSLP_ASSERT(acts[row].n_inf_min == 0 || acts[row].n_inf_max == 0);
     }
 }
 
@@ -802,12 +805,13 @@ void run_debugger_stats_consistency_check(const PresolveStats *stats)
                            stats->nnz_removed_parallel_rows +
                            stats->nnz_removed_parallel_cols;
 
-    assert(stats->nnz_original - stats->nnz_reduced == total_removed);
+    PSLP_ASSERT(stats->nnz_original - stats->nnz_reduced == total_removed);
 
     // valgrind timing is not reliable
 #ifndef RUNNING_ON_VALGRIND
     double time_medium = stats->ps_time_primal_propagation +
                          stats->ps_time_parallel_rows + stats->ps_time_parallel_cols;
-    assert((stats->ps_time_medium - time_medium) / MAX(1e-2, time_medium) < 0.05);
+    PSLP_ASSERT((stats->ps_time_medium - time_medium) / MAX(1e-2, time_medium) <
+                0.05);
 #endif
 }
diff --git a/src/core/Matrix.c b/src/core/Matrix.c
index 3736b024..35d52bbf 100644
--- a/src/core/Matrix.c
+++ b/src/core/Matrix.c
@@ -17,6 +17,7 @@
  */
 
 #include "Matrix.h"
+#include "Binary_search.h"
 #include "Debugger.h"
 #include "Memory_wrapper.h"
 #include "Numerics.h"
@@ -198,14 +199,14 @@ void free_matrix(Matrix *A)
     PS_FREE(A);
 }
 
-void remove_extra_space(Matrix *A, const int *row_sizes, const int *col_sizes,
-                        bool remove_all, int *col_idxs_map)
+void remove_extra_space(Matrix *A, const int *row_sizes, bool remove_all,
+                        const int *col_idxs_map, size_t new_n_cols)
 {
     int j, start, end, len, row_alloc, curr;
     int extra_row_space = (remove_all) ? 0 : EXTRA_ROW_SPACE;
     double extra_mem_ratio = (remove_all) ? 1.0 : EXTRA_MEMORY_RATIO;
     curr = 0;
-    size_t i, n_deleted_rows, col_count;
+    size_t i, n_deleted_rows;
 
     // --------------------------------------------------------------------------
     // loop through the rows and remove redundant space, including inactive
@@ -240,26 +241,10 @@ void remove_extra_space(Matrix *A, const int *row_sizes, const int *col_sizes,
     A->i = (int *) ps_realloc(A->i, (size_t) MAX(curr, 1), sizeof(int));
     A->p = (RowRange *) ps_realloc(A->p, (size_t) (A->m + 1), sizeof(RowRange));
 
-    // -------------------------------------------------------------------------
-    //                      compute new column indices
-    // -------------------------------------------------------------------------
-    col_count = 0;
-    for (i = 0; i < A->n; ++i)
-    {
-        if (col_sizes[i] == SIZE_INACTIVE_COL)
-        {
-            col_idxs_map[i] = -1;
-        }
-        else
-        {
-            col_idxs_map[i] = (int) (col_count++);
-        }
-    }
-    A->n = col_count;
-
     // -------------------------------------------------------------------------
     //                        update column indices
     // -------------------------------------------------------------------------
+    A->n = new_n_cols;
     for (i = 0; i < A->m; ++i)
     {
         for (j = A->p[i].start; j < A->p[i].end; ++j)
@@ -432,23 +417,15 @@ void print_row_starts(const RowRange *row_ranges, size_t len)
 
 double insert_or_update_coeff(Matrix *A, int row, int col, double val, int *row_size)
 {
-    int i, start, end, insertion;
     double old_val = 0.0;
-    start = A->p[row].start;
-    end = A->p[row].end;
-    insertion = end;
+    int start = A->p[row].start;
+    int end = A->p[row].end;
 
     // -----------------------------------------------------------------
     //             find where it should be inserted
     // -----------------------------------------------------------------
-    for (i = start; i < end; ++i)
-    {
-        if (A->i[i] >= col)
-        {
-            insertion = i;
-            break;
-        }
-    }
+    int rel_ins = sorted_lower_bound(A->i + start, end - start, col);
+    int insertion = start + rel_ins;
 
     // -----------------------------------------------------------------
     // Insert the new value if it is nonzero. If it exists or should be
diff --git a/src/core/Presolver.c b/src/core/Presolver.c
index 5adc344d..4cbfd7d6 100644
--- a/src/core/Presolver.c
+++ b/src/core/Presolver.c
@@ -74,7 +74,7 @@ void free_settings(Settings *stgs)
     PS_FREE(stgs);
 }
 
-Settings *default_settings()
+Settings *default_settings(void)
 {
     Settings *stgs = (Settings *) ps_malloc(1, sizeof(Settings));
     RETURN_PTR_IF_NULL(stgs, NULL);
@@ -84,9 +84,8 @@ Settings *default_settings()
     stgs->parallel_cols = true;
     stgs->primal_propagation = true;
     stgs->dual_fix = true;
-    // stgs->clean_small_coeff = false;
     stgs->finite_bound_tightening = true;
-    stgs->relax_bounds = true;
+    stgs->relax_bounds = false;
     stgs->max_shift = 10;
     stgs->max_time = 60.0;
     stgs->verbose = true;
@@ -151,7 +150,6 @@ void set_settings_true(Settings *stgs)
     stgs->parallel_cols = true;
     stgs->primal_propagation = true;
     stgs->dual_fix = true;
-    // stgs->clean_small_coeff = true;
     stgs->finite_bound_tightening = true;
     stgs->relax_bounds = true;
     stgs->verbose = true;
@@ -165,7 +163,6 @@ void set_settings_false(Settings *stgs)
     stgs->parallel_cols = false;
     stgs->primal_propagation = false;
     stgs->dual_fix = false;
-    // stgs->clean_small_coeff = false;
     stgs->finite_bound_tightening = false;
     stgs->relax_bounds = false;
     stgs->verbose = false;
@@ -263,12 +260,6 @@ Presolver *new_presolver(const double *Ax, const int *Ai, const int *Ap, size_t
     A = matrix_new_no_extra_space(Ax, Ai, Ap, n_rows, n_cols, nnz);
     if (!A) goto cleanup;
 
-    // commented out for now because does not seem helpful
-    // if (stgs->clean_small_coeff)
-    // {
-    //     clean_small_coeff_A(A, bounds, row_tags, col_tags, rhs_copy, lhs_copy);
-    // }
-
     ps_thread_t thread_id;
     ParallelInitData parallel_data = {A,    work,     n_cols,   n_rows,   lbs,
                                       ubs,  lhs_copy, rhs_copy, bounds,   col_tags,
@@ -314,7 +305,6 @@ Presolver *new_presolver(const double *Ax, const int *Ai, const int *Ap, size_t
     presolver->stgs = stgs;
     presolver->prob = new_problem(constraints, obj);
     presolver->stats = init_stats(A->m, A->n, nnz);
-    // presolver->stats->nnz_removed_trivial = nnz - A->nnz; due to clean_small_coeff
     presolver->reduced_prob =
         (PresolvedProblem *) ps_calloc(1, sizeof(PresolvedProblem));
     DEBUG(run_debugger(constraints, false));
@@ -461,16 +451,21 @@ static inline PresolveStatus run_trivial_explorers(Problem *prob,
    rows, singleton columns, and simple dual fix. It returns UNCHANGED even if
    the problem was reduced (provided that the problem does not seem infeasible or
    bounded). */
-static inline PresolveStatus run_fast_explorers(Problem *prob, const Settings *stgs)
+static inline PresolveStatus run_fast_explorers(Problem *prob, const Settings *stgs,
+                                                PresolveStats *stats)
 {
     assert(prob->constraints->state->ston_rows->len == 0);
     assert(prob->constraints->state->empty_rows->len == 0);
     assert(prob->constraints->state->empty_cols->len == 0);
     PresolveStatus status = UNCHANGED;
+    Timer timer;
 
     if (stgs->ston_cols)
     {
+        clock_gettime(CLOCK_MONOTONIC, &timer.start);
         status |= remove_ston_cols(prob);
+        clock_gettime(CLOCK_MONOTONIC, &timer.end);
+        stats->time_ston_cols += GET_ELAPSED_SECONDS(timer);
 
         // after removing singleton columns, there can be new empty columns
         // and singleton rows, but no empty rows
@@ -480,7 +475,10 @@ static inline PresolveStatus run_fast_explorers(Problem *prob, const Settings *s
 
     if (stgs->dton_eq)
     {
+        clock_gettime(CLOCK_MONOTONIC, &timer.start);
         status |= remove_dton_eq_rows(prob, stgs->max_shift);
+        clock_gettime(CLOCK_MONOTONIC, &timer.end);
+        stats->time_dton_rows += GET_ELAPSED_SECONDS(timer);
         // after removing doubleton equality rows, there can be new empty rows,
         // new singleton rows, and new empty columns
         status |= run_trivial_explorers(prob, stgs);
@@ -505,66 +503,57 @@ run_medium_explorers(Problem *prob, const Settings *stgs, PresolveStats *stats)
 
     if (stgs->primal_propagation)
     {
-        // stats
-        clock_gettime(CLOCK_MONOTONIC, &timer.start);
         nnz_before = *nnz;
 
         // the actual reduction
+        clock_gettime(CLOCK_MONOTONIC, &timer.start);
         status |= check_activities(prob);
         status |= propagate_primal(prob, stgs->finite_bound_tightening);
+        clock_gettime(CLOCK_MONOTONIC, &timer.end);
+        stats->time_primal_propagation += GET_ELAPSED_SECONDS(timer);
 
         // after dom prop propagation there can be new empty and ston rows
         status |= run_trivial_explorers(prob, stgs);
         assert(!(status & REDUCED));
         RETURN_IF_NOT_UNCHANGED(status);
-
-        // stats
         stats->nnz_removed_primal_propagation += nnz_before - *nnz;
-        clock_gettime(CLOCK_MONOTONIC, &timer.end);
-        stats->time_primal_propagation += GET_ELAPSED_SECONDS(timer);
     }
 
     if (stgs->parallel_rows)
     {
-        // stats
         nnz_before = *nnz;
-        clock_gettime(CLOCK_MONOTONIC, &timer.start);
 
         // the actual reduction (after removing parallel rows there can
         // be no new empty or ston rows so no need to run trivial presolvers)
+        clock_gettime(CLOCK_MONOTONIC, &timer.start);
         status |= remove_parallel_rows(prob->constraints);
+        clock_gettime(CLOCK_MONOTONIC, &timer.end);
+        stats->time_parallel_rows += GET_ELAPSED_SECONDS(timer);
         assert(prob->constraints->state->empty_rows->len == 0);
         assert(prob->constraints->state->ston_rows->len == 0);
         assert(!(status & REDUCED));
         RETURN_IF_NOT_UNCHANGED(status);
-
-        // stats
         stats->nnz_removed_parallel_rows += nnz_before - *nnz;
-        clock_gettime(CLOCK_MONOTONIC, &timer.end);
-        stats->time_parallel_rows += GET_ELAPSED_SECONDS(timer);
     }
 
     if (stgs->parallel_cols)
     {
-        // stats
         nnz_before = *nnz;
-        clock_gettime(CLOCK_MONOTONIC, &timer.start);
 
         // the actual reduction (after removing parallel columns there can
         // be no new empty rows or empty cols but might be new stonrows, probably
-        // although that's rare but it does happen
+        // although that's rare but it does happen)
+        clock_gettime(CLOCK_MONOTONIC, &timer.start);
         status |= remove_parallel_cols(prob);
+        clock_gettime(CLOCK_MONOTONIC, &timer.end);
+        stats->time_parallel_cols += GET_ELAPSED_SECONDS(timer);
         assert(prob->constraints->state->empty_rows->len == 0);
         assert(prob->constraints->state->empty_cols->len == 0);
         status |= run_trivial_explorers(prob, stgs);
         assert(prob->constraints->state->ston_rows->len == 0);
         assert(!(status & REDUCED));
         RETURN_IF_NOT_UNCHANGED(status);
-
-        // stats
         stats->nnz_removed_parallel_cols += nnz_before - *nnz;
-        clock_gettime(CLOCK_MONOTONIC, &timer.end);
-        stats->time_parallel_cols += GET_ELAPSED_SECONDS(timer);
     }
 
     return status;
@@ -701,7 +690,7 @@ PresolveStatus run_presolver(Presolver *presolver)
         {
             nnz_before_reduction = A->nnz;
             RUN_AND_TIME(run_fast_explorers, inner_timer,
-                         stats->time_fast_reductions, status, prob, stgs);
+                         stats->time_fast_reductions, status, prob, stgs, stats);
             stats->nnz_removed_fast += nnz_before_reduction - A->nnz;
         }
         else if (curr_complexity == MEDIUM)
diff --git a/src/core/Workspace.c b/src/core/Workspace.c
index ffd3d46b..3156a715 100644
--- a/src/core/Workspace.c
+++ b/src/core/Workspace.c
@@ -35,6 +35,7 @@ Work *new_work(size_t n_rows, size_t n_cols)
     work->iwork2_max_nrows_ncols =
         (int *) ps_calloc(MAX(n_rows, n_cols), sizeof(int));
     work->int_vec = iVec_new(INT_VEC_INITIALIZATION);
+    work->radix_aux = (int *) ps_calloc(MAX(n_rows, n_cols), sizeof(int));
     work->mappings = (Mapping *) ps_malloc(1, sizeof(Mapping));
     work->mappings->cols = (int *) ps_malloc(n_cols, sizeof(int));
     work->mappings->rows = (int *) ps_malloc(n_rows, sizeof(int));
@@ -42,7 +43,7 @@ Work *new_work(size_t n_rows, size_t n_cols)
     if (!work->iwork_n_cols || !work->iwork_n_rows ||
         !work->iwork1_max_nrows_ncols || !work->iwork2_max_nrows_ncols ||
         !work->int_vec || !work->mappings || !work->mappings->cols ||
-        !work->mappings->rows)
+        !work->mappings->rows || !work->radix_aux)
     {
         free_work(work);
         return NULL;
@@ -66,5 +67,6 @@ void free_work(Work *work)
     PS_FREE(work->mappings->cols);
     PS_FREE(work->mappings->rows);
     PS_FREE(work->mappings);
+    PS_FREE(work->radix_aux);
     PS_FREE(work);
 }
diff --git a/src/core/radix_sort.c b/src/core/radix_sort.c
new file mode 100644
index 00000000..ee81e4bb
--- /dev/null
+++ b/src/core/radix_sort.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright 2025 Daniel Cederberg
+ *
+ * This file is part of the PSLP project (LP Presolver).
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "radix_sort.h"
+#include "pslp_thread.h"
+#include <stdint.h>
+#include <string.h>
+
+#define NUM_SORT_THREADS 4
+#define PARALLEL_SORT_THRESHOLD 100000
+
+static void insertion_sort_rows(int *rows, size_t n, const int *sparsity_IDs,
+                                const int *coeff_hashes)
+{
+    for (size_t i = 1; i < n; i++)
+    {
+        int key = rows[i];
+        int key_sp = sparsity_IDs[key];
+        int key_ch = coeff_hashes[key];
+        size_t j = i;
+        while (j > 0)
+        {
+            int prev_sp = sparsity_IDs[rows[j - 1]];
+            int prev_ch = coeff_hashes[rows[j - 1]];
+            if (prev_sp < key_sp || (prev_sp == key_sp && prev_ch < key_ch))
+            {
+                break;
+            }
+            rows[j] = rows[j - 1];
+            j--;
+        }
+        rows[j] = key;
+    }
+}
+
+void radix_sort_rows(int *rows, size_t n, const int *sparsity_IDs,
+                     const int *coeff_hashes, int *aux)
+{
+    if (n < 256)
+    {
+        insertion_sort_rows(rows, n, sparsity_IDs, coeff_hashes);
+        return;
+    }
+
+    size_t counts[256];
+    int *src = rows, *dst = aux;
+
+    // Phase 1: 4 passes on coeff_hashes (secondary key, sorted first in LSD)
+    // Phase 2: 4 passes on sparsity_IDs (primary key, sorted last in LSD)
+    for (int phase = 0; phase < 2; phase++)
+    {
+        const int *keys = (phase == 0) ? coeff_hashes : sparsity_IDs;
+
+        for (int pass = 0; pass < 4; pass++)
+        {
+            int shift = pass * 8;
+
+            // histogram
+            memset(counts, 0, 256 * sizeof(size_t));
+            for (size_t i = 0; i < n; i++)
+            {
+                unsigned byte = ((uint32_t) keys[src[i]] >> shift) & 0xFF;
+                counts[byte]++;
+            }
+
+            // skip pass if all values fall in one bucket (never skip pass 0)
+            if (!(phase == 0 && pass == 0))
+            {
+                int skip = 0;
+                for (int b = 0; b < 256; b++)
+                {
+                    if (counts[b] == n)
+                    {
+                        skip = 1;
+                        break;
+                    }
+                }
+                if (skip)
+                {
+                    continue;
+                }
+            }
+
+            // prefix sum
+            size_t total = 0;
+            for (int b = 0; b < 256; b++)
+            {
+                size_t c = counts[b];
+                counts[b] = total;
+                total += c;
+            }
+
+            // scatter: backward for pass 0 (reverse stability), forward otherwise
+            if (phase == 0 && pass == 0)
+            {
+                // backward scatter reverses equal-byte elements
+                for (size_t i = n; i > 0; i--)
+                {
+                    unsigned byte = ((uint32_t) keys[src[i - 1]] >> shift) & 0xFF;
+                    dst[counts[byte]++] = src[i - 1];
+                }
+            }
+            else
+            {
+                for (size_t i = 0; i < n; i++)
+                {
+                    unsigned byte = ((uint32_t) keys[src[i]] >> shift) & 0xFF;
+                    dst[counts[byte]++] = src[i];
+                }
+            }
+
+            // swap src and dst
+            int *tmp = src;
+            src = dst;
+            dst = tmp;
+        }
+    }
+
+    // if result ended up in aux, copy back to rows
+    if (src != rows)
+    {
+        memcpy(rows, src, n * sizeof(int));
+    }
+}
+
+// --- Parallel radix sort infrastructure ---
+
+typedef struct
+{
+    int *rows;
+    size_t n;
+    const int *sparsity_IDs;
+    const int *coeff_hashes;
+    int *aux;
+} SortChunkArg;
+
+static void *sort_chunk_func(void *arg)
+{
+    SortChunkArg *a = (SortChunkArg *) arg;
+    radix_sort_rows(a->rows, a->n, a->sparsity_IDs, a->coeff_hashes, a->aux);
+    return NULL;
+}
+
+// Compare two row indices by (sparsity_ID, coeff_hash) as uint32_t.
+// Returns <0 if a comes first, >0 if b comes first, 0 if equal.
+static inline int merge_compare(int a, int b, const int *sparsity_IDs,
+                                const int *coeff_hashes)
+{
+    uint32_t sa = (uint32_t) sparsity_IDs[a];
+    uint32_t sb = (uint32_t) sparsity_IDs[b];
+    if (sa != sb) return (sa < sb) ? -1 : 1;
+    uint32_t ca = (uint32_t) coeff_hashes[a];
+    uint32_t cb = (uint32_t) coeff_hashes[b];
+    if (ca != cb) return (ca < cb) ? -1 : 1;
+    return 0;
+}
+
+// 4-way merge of sorted chunks into dst.
+static void merge_4way(int *dst, int *chunk_ptrs[NUM_SORT_THREADS],
+                       size_t chunk_sizes[NUM_SORT_THREADS], const int *sparsity_IDs,
+                       const int *coeff_hashes, size_t total_n)
+{
+    size_t pos[NUM_SORT_THREADS] = {0, 0, 0, 0};
+
+    for (size_t out = 0; out < total_n; out++)
+    {
+        int best = -1;
+        for (int k = 0; k < NUM_SORT_THREADS; k++)
+        {
+            if (pos[k] >= chunk_sizes[k]) continue;
+            if (best < 0 ||
+                merge_compare(chunk_ptrs[k][pos[k]], chunk_ptrs[best][pos[best]],
+                              sparsity_IDs, coeff_hashes) <= 0)
+            {
+                best = k;
+            }
+        }
+        dst[out] = chunk_ptrs[best][pos[best]++];
+    }
+}
+
+void parallel_radix_sort_rows(int *rows, size_t n, const int *sparsity_IDs,
+                              const int *coeff_hashes, int *aux)
+{
+    if (n < PARALLEL_SORT_THRESHOLD)
+    {
+        radix_sort_rows(rows, n, sparsity_IDs, coeff_hashes, aux);
+        return;
+    }
+
+    // Divide into NUM_SORT_THREADS chunks.
+    // First (n % NUM_SORT_THREADS) chunks get one extra element.
+    size_t base = n / NUM_SORT_THREADS;
+    size_t extra = n % NUM_SORT_THREADS;
+
+    SortChunkArg args[NUM_SORT_THREADS];
+    size_t chunk_sizes[NUM_SORT_THREADS];
+    int *chunk_ptrs[NUM_SORT_THREADS];
+    size_t offset = 0;
+
+    for (int k = 0; k < NUM_SORT_THREADS; k++)
+    {
+        chunk_sizes[k] = base + (k < (int) extra ? 1 : 0);
+        chunk_ptrs[k] = rows + offset;
+        args[k].rows = rows + offset;
+        args[k].n = chunk_sizes[k];
+        args[k].sparsity_IDs = sparsity_IDs;
+        args[k].coeff_hashes = coeff_hashes;
+        args[k].aux = aux + offset;
+        offset += chunk_sizes[k];
+    }
+
+    // Launch worker threads for chunks 1..3, main thread sorts chunk 0.
+    ps_thread_t threads[NUM_SORT_THREADS - 1];
+    for (int k = 1; k < NUM_SORT_THREADS; k++)
+    {
+        ps_thread_create(&threads[k - 1], NULL, sort_chunk_func, &args[k]);
+    }
+    sort_chunk_func(&args[0]);
+
+    for (int k = 1; k < NUM_SORT_THREADS; k++)
+    {
+        ps_thread_join(&threads[k - 1], NULL);
+    }
+
+    // 4-way merge into aux, then copy back.
+    merge_4way(aux, chunk_ptrs, chunk_sizes, sparsity_IDs, coeff_hashes, n);
+    memcpy(rows, aux, n * sizeof(int));
+}
diff --git a/src/explorers/DtonsEq.c b/src/explorers/DtonsEq.c
index 1f08315e..adbe8292 100644
--- a/src/explorers/DtonsEq.c
+++ b/src/explorers/DtonsEq.c
@@ -18,6 +18,7 @@
 
 #include "DTonsEq.h"
 #include "Activity.h"
+#include "Binary_search.h"
 #include "Bounds.h"
 #include "Constraints.h"
 #include "CoreTransformations.h"
@@ -160,7 +161,7 @@ static inline void can_dton_be_eliminated(Matrix *AT, const double *row_vals,
     abs_pivot = ABS(abs_pivot);
     if (abs_pivot > MAX_RATIO_PIVOT || abs_pivot < 1 / MAX_RATIO_PIVOT)
     {
-        assert("Is this ever triggered now with this small threshold?");
+        assert(0 && "Is this ever triggered now with this small threshold?");
         *stay = -1;
         *subst = -1;
         return;
@@ -236,11 +237,11 @@ static inline void modify_bounds(Constraints *constraints, int i, double aij,
 #ifndef TESTING
 static inline
 #endif
-    Old_and_new_coeff
-    update_row_A_dton(Matrix *A, int i, int q, int j, int k, double aij, double aik,
-                      int *row_size, PostsolveInfo *postsolve_info)
+    Old_and_new_coeff update_row_A_dton(Matrix *A, int i, int q, int j, int k,
+                                        double aij, double aik, int *row_size,
+                                        PostsolveInfo *postsolve_info)
 {
-    int ii, start, end, insertion;
+    int start, end, insertion;
     double old_val = 0.0;
     int subst_idx = -1;
     int diff_row_size = 0;
@@ -252,25 +253,13 @@ static inline
     // find index of variable that is substituted and and the index of
     // insertion of the variable that stays
     // -----------------------------------------------------------------
-    for (ii = start; ii < end; ++ii)
-    {
-        if (A->i[ii] == k)
-        {
-            subst_idx = ii;
-            break;
-        }
-    }
-
-    assert(subst_idx != -1);
+    int row_len = end - start;
+    int rel = sorted_find(A->i + start, row_len, k);
+    assert(rel != -1);
+    subst_idx = start + rel;
 
-    for (ii = start; ii < end; ++ii)
-    {
-        if (A->i[ii] >= j)
-        {
-            insertion = ii;
-            break;
-        }
-    }
+    int rel_ins = sorted_lower_bound(A->i + start, row_len, j);
+    insertion = start + rel_ins;
 
     // -----------------------------------------------------------------
     // Compute new coefficient of the variable that stays.
diff --git a/src/explorers/Parallel_cols.c b/src/explorers/Parallel_cols.c
index 717b72e1..b8191e16 100644
--- a/src/explorers/Parallel_cols.c
+++ b/src/explorers/Parallel_cols.c
@@ -33,7 +33,7 @@
 #include "Workspace.h"
 
 static inline PresolveStatus process_single_bin(const Problem *prob, const int *bin,
-                                                int bin_size)
+                                                int bin_size, int *rows_to_recompute)
 {
     assert(bin_size > 1);
     Constraints *constraints = prob->constraints;
@@ -42,7 +42,6 @@ static inline PresolveStatus process_single_bin(const Problem *prob, const int *
     const Matrix *AT = constraints->AT;
     const RowRange *col_ranges = AT->p;
     const double *c = prob->obj->c;
-    Activity *acts = constraints->state->activities;
     iVec *sub_cols_to_delete = constraints->state->sub_cols_to_delete;
     PostsolveInfo *postsolve_info = constraints->state->postsolve_info;
 
@@ -157,7 +156,7 @@ static inline PresolveStatus process_single_bin(const Problem *prob, const int *
                 }
 
                 // ---------------------------------------------------------------------
-                //                  mark col as substituted
+                //                  mark col k as substituted
                 // ---------------------------------------------------------------------
                 set_col_to_substituted(k, col_tags + k, sub_cols_to_delete);
                 save_retrieval_parallel_col(postsolve_info, ub_j_old, lb_j_old,
@@ -241,11 +240,12 @@ static inline PresolveStatus process_single_bin(const Problem *prob, const int *
                         return UNBNDORINFEAS;
                     }
 
-                    assert(!HAS_TAG(col_tags[k], C_TAG_INACTIVE));
+                    assert(!HAS_TAG(col_tags[j], C_TAG_INACTIVE));
                     assert(!IS_ABS_INF(bounds[j].lb));
                     fix_col(constraints, j, bounds[j].lb, cj);
-                    // break from checking if xj is parallel to other cols since
-                    // we fix it
+                    recount_ninfs = false;
+                    //  break from checking if xj is parallel to other cols since
+                    //  we fix it
                     break;
                 }
                 else if (fix_xj_to_upper)
@@ -255,11 +255,12 @@ static inline PresolveStatus process_single_bin(const Problem *prob, const int *
                         return UNBNDORINFEAS;
                     }
 
-                    assert(!HAS_TAG(col_tags[k], C_TAG_INACTIVE));
+                    assert(!HAS_TAG(col_tags[j], C_TAG_INACTIVE));
                     assert(!IS_ABS_INF(bounds[j].ub));
                     fix_col(constraints, j, bounds[j].ub, cj);
-                    // break from checking if xj is parallel to other cols since
-                    // we fix it
+                    recount_ninfs = false;
+                    //  break from checking if xj is parallel to other cols since
+                    //  we fix it
                     break;
                 }
             }
@@ -269,15 +270,9 @@ static inline PresolveStatus process_single_bin(const Problem *prob, const int *
         // j appears in due to bound change
         if (recount_ninfs)
         {
-            const Matrix *A = constraints->A;
-
             for (jj = 0; jj < len_j; jj++)
             {
-                int row = aj_cols[jj];
-
-                recompute_n_infs(acts + row, A->x + A->p[row].start,
-                                 A->i + A->p[row].start,
-                                 A->p[row].end - A->p[row].start, col_tags);
+                rows_to_recompute[aj_cols[jj]] = 1;
             }
         }
     }
@@ -286,13 +281,17 @@ static inline PresolveStatus process_single_bin(const Problem *prob, const int *
 }
 
 static PresolveStatus process_all_bins(const Problem *prob, const int *parallel_cols,
-                                       const iVec *groups)
+                                       const iVec *groups, int *rows_to_recompute)
 {
     if (groups->len <= 1)
     {
         return UNCHANGED;
     }
 
+    Constraints *constraints = prob->constraints;
+    size_t n_rows = constraints->A->m;
+    memset(rows_to_recompute, 0, n_rows * sizeof(int));
+
     size_t n_groups = groups->len - 1;
     PresolveStatus status = UNCHANGED;
 
@@ -300,7 +299,21 @@ static PresolveStatus process_all_bins(const Problem *prob, const int *parallel_
     {
         int n_cols_this_group = groups->data[i + 1] - groups->data[i];
         status |= process_single_bin(prob, parallel_cols + groups->data[i],
-                                     n_cols_this_group);
+                                     n_cols_this_group, rows_to_recompute);
+    }
+
+    // batch recompute n_infs for all affected rows
+    const Matrix *A = constraints->A;
+    Activity *acts = constraints->state->activities;
+    ColTag *col_tags = constraints->col_tags;
+
+    for (size_t i = 0; i < n_rows; ++i)
+    {
+        if (rows_to_recompute[i])
+        {
+            recompute_n_infs(acts + i, A->x + A->p[i].start, A->i + A->p[i].start,
+                             A->p[i].end - A->p[i].start, col_tags);
+        }
     }
 
     return status;
@@ -321,7 +334,8 @@ PresolveStatus remove_parallel_cols(Problem *prob)
 
     // finding parallel rows of AT is the same as finding parallel cols of A
     find_parallel_rows(constraints->AT, constraints->col_tags, group_starts,
-                       parallel_cols, sparsity_IDs, coeff_hashes, C_TAG_INACTIVE);
+                       parallel_cols, sparsity_IDs, coeff_hashes, C_TAG_INACTIVE,
+                       constraints->state->work->radix_aux);
 #ifndef NDEBUG
     // there should be no inactive cols in group_starts here
     for (int i = 0; i < group_starts->len - 1; ++i)
@@ -336,7 +350,9 @@ PresolveStatus remove_parallel_cols(Problem *prob)
     }
 #endif
 
-    PresolveStatus status = process_all_bins(prob, parallel_cols, group_starts);
+    int *rows_to_recompute = sparsity_IDs;
+    PresolveStatus status =
+        process_all_bins(prob, parallel_cols, group_starts, rows_to_recompute);
     assert(constraints->state->empty_rows->len == 0);
 
     delete_fixed_cols_from_problem(prob);
diff --git a/src/explorers/Parallel_rows.c b/src/explorers/Parallel_rows.c
index f29c8b93..7de5ffc7 100644
--- a/src/explorers/Parallel_rows.c
+++ b/src/explorers/Parallel_rows.c
@@ -28,15 +28,12 @@
 #include "Workspace.h"
 #include "iVec.h"
 #include "limits.h" // for INT_MAX
+#include "radix_sort.h"
 #include "utils.h"
 #include <PSLP_warnings.h>
 #include <math.h> // For round()
 #include <stdint.h>
 
-// global variables needed for qsort
-static int *global_sparsity_IDs;
-static int *global_coeff_hashes;
-
 // djb2 hash function
 static inline uint32_t hash_int_array(const int *arr, int size)
 {
@@ -111,41 +108,10 @@ static inline
     }
 }
 
-int comparator(const void *a, const void *b)
+static void sort_rows(int *rows, size_t n, const int *sparsity_IDs,
+                      const int *coeff_hashes, int *aux)
 {
-    int idxA = *(const int *) a;
-    int idxB = *(const int *) b;
-
-    if (global_sparsity_IDs[idxA] < global_sparsity_IDs[idxB])
-    {
-        return -1;
-    }
-
-    if (global_sparsity_IDs[idxA] > global_sparsity_IDs[idxB])
-    {
-        return 1;
-    }
-
-    if (global_coeff_hashes[idxA] < global_coeff_hashes[idxB])
-    {
-        return -1;
-    }
-
-    if (global_coeff_hashes[idxA] > global_coeff_hashes[idxB])
-    {
-        return 1;
-    }
-
-    return 0;
-}
-
-// Sort function
-static inline void sort_rows(int *rows, size_t n_rows, int *sparsity_IDs,
-                             int *coeff_hashes)
-{
-    global_sparsity_IDs = sparsity_IDs;
-    global_coeff_hashes = coeff_hashes;
-    qsort(rows, n_rows, sizeof(int), comparator);
+    parallel_radix_sort_rows(rows, n, sparsity_IDs, coeff_hashes, aux);
 }
 
 static inline int get_bin_size(int start, size_t n_rows, const int *rows,
@@ -306,7 +272,7 @@ static inline int find_parallel_rows_in_bin(const Matrix *A, const int *bin,
 // replaced rows with parallel_rows for less memory usage
 void find_parallel_rows(const Matrix *A, const RowTag *r_Tags, iVec *group_starts,
                         int *parallel_rows, int *sparsity_IDs, int *coeff_hashes,
-                        RowTag INACTIVE_TAG)
+                        RowTag INACTIVE_TAG, int *radix_aux)
 {
     int i, bin_size;
     int n_p_rows_total = 0;
@@ -320,12 +286,16 @@ void find_parallel_rows(const Matrix *A, const RowTag *r_Tags, iVec *group_start
     // that rows with the same sparsity pattern and the same coefficient hash
     // value end up next to each other.
     // ------------------------------------------------------------------------
+    int n_active = 0;
     for (i = 0; i < A->m; i++)
     {
-        parallel_rows[i] = i;
+        if (sparsity_IDs[i] != INT_MAX)
+        {
+            parallel_rows[n_active++] = i;
+        }
     }
-
-    sort_rows(parallel_rows, A->m, sparsity_IDs, coeff_hashes);
+    sort_rows(parallel_rows, (size_t) n_active, sparsity_IDs, coeff_hashes,
+              radix_aux);
 
 #ifndef NDEBUG
     if (INACTIVE_TAG == R_TAG_INACTIVE)
@@ -341,14 +311,10 @@ void find_parallel_rows(const Matrix *A, const RowTag *r_Tags, iVec *group_start
     // ------------------------------------------------------------------------
     iVec_clear_no_resize(group_starts);
     iVec_append(group_starts, 0);
-    for (i = 0; i < A->m; i++)
+    for (i = 0; i < n_active; i++)
     {
-        if (HAS_TAG(r_Tags[parallel_rows[i]], INACTIVE_TAG))
-        {
-            break;
-        }
-
-        bin_size = get_bin_size(i, A->m, parallel_rows, sparsity_IDs, coeff_hashes);
+        bin_size = get_bin_size(i, (size_t) n_active, parallel_rows, sparsity_IDs,
+                                coeff_hashes);
 
         // find the parallel rows in the bin
         if (bin_size > 1)
@@ -610,7 +576,8 @@ PresolveStatus remove_parallel_rows(Constraints *constraints)
     iVec *group_starts = constraints->state->work->int_vec;
 
     find_parallel_rows(constraints->A, constraints->row_tags, group_starts,
-                       parallel_rows, sparsity_IDs, coeff_hashes, R_TAG_INACTIVE);
+                       parallel_rows, sparsity_IDs, coeff_hashes, R_TAG_INACTIVE,
+                       constraints->state->work->radix_aux);
 
     PresolveStatus status =
         process_all_bins(constraints, parallel_rows, group_starts);
diff --git a/src/explorers/Simple_dual_fix.c b/src/explorers/Simple_dual_fix.c
index dda15678..4f6c6d53 100644
--- a/src/explorers/Simple_dual_fix.c
+++ b/src/explorers/Simple_dual_fix.c
@@ -35,6 +35,7 @@ static inline PresolveStatus _simple_dual_fix(Constraints *constraints, double c
                                               iVec *cols_to_inf)
 {
     assert(!HAS_TAG(col_tag, C_TAG_INACTIVE));
+
     // --------------------------------------------------
     // see if we can fix the variable to its lower bound
     // --------------------------------------------------
diff --git a/src/explorers/StonCols.c b/src/explorers/StonCols.c
index 5f070980..4cf05e36 100644
--- a/src/explorers/StonCols.c
+++ b/src/explorers/StonCols.c
@@ -306,6 +306,7 @@ static PresolveStatus process_colston_eq(RowView *row, ColView *col, Objective *
             we only call this function with val equal to one of the
             bounds.
  */
+/*
 static void fix_col_ston(double val, double Aik, RowView *row, ColView *col,
                          Activity *act, Objective *obj, size_t *A_nnz,
                          const Bound *bounds, PostsolveInfo *postsolve_info)
@@ -372,12 +373,12 @@ static void fix_col_ston(double val, double Aik, RowView *row, ColView *col,
     save_retrieval_fixed_col(postsolve_info, col->k, val, obj->c[col->k], &Aik,
                              &row->i, 1);
 }
+*/
 
 static inline PresolveStatus
 process_colston_ineq(RowView *row, ColView *col, Objective *obj, double Aik,
                      bool impl_free_from_above, bool impl_free_from_below,
-                     Activity *act, Lock *locks, iVec *rows_to_delete, size_t *A_nnz,
-                     const Bound *bounds, iVec *ston_rows, iVec *dton_rows,
+                     Lock *locks, iVec *rows_to_delete, iVec *dton_rows,
                      PostsolveInfo *postsolve_info)
 {
     bool is_lhs_inf = HAS_TAG(*row->tag, R_TAG_LHS_INF);
@@ -386,54 +387,13 @@ process_colston_ineq(RowView *row, ColView *col, Objective *obj, double Aik,
     bool is_lb_inf = HAS_TAG(*col->tag, C_TAG_LB_INF);
     double ck = obj->c[col->k];
 
-    // ------------------------------------------------------------------------
-    //                    dual fix to lower bound
-    // ------------------------------------------------------------------------
-    if ((ck > 0 && Aik > 0 && is_lhs_inf) || (ck > 0 && Aik < 0 && is_rhs_inf))
-    {
-        if (is_lb_inf)
-        {
-            return UNBNDORINFEAS;
-        }
-        assert(!IS_ABS_INF(*col->lb));
-
-        fix_col_ston(*col->lb, Aik, row, col, act, obj, A_nnz, bounds,
-                     postsolve_info);
-
-        // should not check for dton row here since the constraint is an ineq
-        if (*row->len == 1)
-        {
-            assert(!iVec_contains(ston_rows, row->i));
-            iVec_append(ston_rows, row->i);
-        }
-
-        assert(*row->len != 0);
-        return REDUCED;
-    }
-
-    // ------------------------------------------------------------------------
-    //                    dual fix to upper bound
-    // ------------------------------------------------------------------------
-    if ((ck < 0 && Aik < 0 && is_lhs_inf) || (ck < 0 && Aik > 0 && is_rhs_inf))
+    /* unboundedness check */
+    if ((ck > 0 && Aik > 0 && is_lhs_inf && is_lb_inf) ||
+        (ck > 0 && Aik < 0 && is_rhs_inf && is_lb_inf) ||
+        (ck < 0 && Aik < 0 && is_lhs_inf && is_ub_inf) ||
+        (ck < 0 && Aik > 0 && is_rhs_inf && is_ub_inf))
     {
-        if (is_ub_inf)
-        {
-            return UNBNDORINFEAS;
-        }
-        assert(!IS_ABS_INF(*col->ub));
-
-        fix_col_ston(*col->ub, Aik, row, col, act, obj, A_nnz, bounds,
-                     postsolve_info);
-
-        // should not check for dton row here since the constraint is an ineq
-        if (*row->len == 1)
-        {
-            assert(!iVec_contains(ston_rows, row->i));
-            iVec_append(ston_rows, row->i);
-        }
-
-        assert(*row->len != 0);
-        return REDUCED;
+        return UNBNDORINFEAS;
     }
 
     // ------------------------------------------------------------------------
@@ -667,8 +627,8 @@ PresolveStatus remove_ston_cols__(Problem *prob)
         {
             status |= process_colston_ineq(
                 &row_view, &col_view, prob->obj, Aik, impl_free_from_above,
-                impl_free_from_below, acts + i, locks, rows_to_delete, A_nnz, bounds,
-                ston_rows, dton_rows, postsolve_info);
+                impl_free_from_below, locks, rows_to_delete, dton_rows,
+                postsolve_info);
         }
     }
 
diff --git a/tests/all_tests.c b/tests/all_tests.c
index 2d6f0c11..1fcb3bd4 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -10,6 +10,7 @@
 #include "test_iVec.h"
 #include "test_pathological.h"
 #include "test_postsolve.h"
+#include "test_radix_sort.h"
 #include "test_ston.h"
 
 const char *run_all_tests()
@@ -22,14 +23,16 @@ const char *run_all_tests()
     mu_assert("ston error", test_ston());
     mu_assert("simple reductions error", test_simple());
     mu_assert("domain propagation error", test_domain());
+    mu_assert("radix_sort error", test_radix_sort());
     mu_assert("presolver error", test_presolver());
     mu_assert("pathological error", test_pathological());
+    mu_assert("parallel_cols error", test_parallel_cols());
 
 #ifndef _WIN32
-    /* windows build seems to keep different order of parallel rows/cols */
+    /* windows build is a bit weird in debug mode */
     mu_assert("parallel_rows error", test_parallel_rows());
-    mu_assert("parallel_cols error", test_parallel_cols());
 #endif
+
     mu_assert("postsolve error", test_postsolve());
 
     return NULL;
diff --git a/tests/test_Constraints.h b/tests/test_Constraints.h
index d0dec3b3..76ceb7a5 100644
--- a/tests/test_Constraints.h
+++ b/tests/test_Constraints.h
@@ -61,8 +61,10 @@ static char *test_1_constraints()
     mu_assert("error row_sizes", ARRAYS_EQUAL_INT(row_sizes, correct_row_sizes, 3));
 
     int work_map[4] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, work_map);
-    remove_extra_space(AT, col_sizes, row_sizes, true, work_map);
+    int new_n_cols = update_column_map(col_sizes, work_map, 4);
+    remove_extra_space(A, row_sizes, true, work_map, new_n_cols);
+    int new_n_rows = update_column_map(row_sizes, work_map, 3);
+    remove_extra_space(AT, col_sizes, true, work_map, new_n_rows);
 
     double A_vals_correct[] = {2, -1, 1, 1, 1, 1};
     int A_cols_correct[] = {0, 3, 0, 1, 2, 3};
@@ -130,7 +132,7 @@ static char *test_2_constraints()
         constraints_new(A, AT, lhs, rhs, bounds, data, row_tags, col_tags);
 
     // remove rows 1, 3, 5
-    iVec_append_array(data->rows_to_delete, (int[]){1, 3, 5}, 3);
+    iVec_append_array(data->rows_to_delete, (int[]) {1, 3, 5}, 3);
     delete_inactive_rows(constraints);
 
     // test for correctness
@@ -142,8 +144,10 @@ static char *test_2_constraints()
     mu_assert("error row_sizes", ARRAYS_EQUAL_INT(row_sizes, correct_row_sizes, 10));
 
     int work_map[10] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, work_map);
-    remove_extra_space(AT, col_sizes, row_sizes, true, work_map);
+    int new_n_cols = update_column_map(col_sizes, work_map, 10);
+    remove_extra_space(A, row_sizes, true, work_map, new_n_cols);
+    int new_n_rows = update_column_map(row_sizes, work_map, 10);
+    remove_extra_space(AT, col_sizes, true, work_map, new_n_rows);
 
     double A_vals_correct[] = {1.3,   -0.6,  0.26,  -0.42, 1.31,  0.47, -1.47, 0.63,
                                -0.93, -1.46, -2.37, 0.25,  -1.45, -0.5, 0.61,  1.38,
@@ -221,8 +225,10 @@ static char *test_3_constraints()
     mu_assert("error row_sizes", ARRAYS_EQUAL_INT(row_sizes, correct_row_sizes, 3));
 
     int work_map[4] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, work_map);
-    remove_extra_space(AT, col_sizes, row_sizes, true, work_map);
+    int new_n_cols = update_column_map(col_sizes, work_map, 4);
+    remove_extra_space(A, row_sizes, true, work_map, new_n_cols);
+    int new_n_rows = update_column_map(row_sizes, work_map, 3);
+    remove_extra_space(AT, col_sizes, true, work_map, new_n_rows);
 
     double A_vals_correct[] = {1, 1, 2, -1, 1, 1, 1};
     int A_cols_correct[] = {0, 1, 0, 2, 0, 1, 2};
@@ -286,7 +292,7 @@ static char *test_4_constraints()
         constraints_new(A, AT, lhs, rhs, bounds, data, row_tags, col_tags);
 
     // remove cols 1, 3, 5
-    iVec_append_array(data->fixed_cols_to_delete, (int[]){1, 3, 5}, 3);
+    iVec_append_array(data->fixed_cols_to_delete, (int[]) {1, 3, 5}, 3);
     delete_inactive_cols_from_A_and_AT(constraints);
 
     // test for correctness
@@ -298,8 +304,10 @@ static char *test_4_constraints()
     mu_assert("error row_sizes", ARRAYS_EQUAL_INT(row_sizes, correct_row_sizes, 10));
 
     int work_map[10] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, work_map);
-    remove_extra_space(AT, col_sizes, row_sizes, true, work_map);
+    int new_n_cols = update_column_map(col_sizes, work_map, 10);
+    remove_extra_space(A, row_sizes, true, work_map, new_n_cols);
+    int new_n_rows = update_column_map(row_sizes, work_map, 10);
+    remove_extra_space(AT, col_sizes, true, work_map, new_n_rows);
 
     double A_vals_correct[] = {1.12, -0.29, -0.35, -0.72, 0.82, 0.32,  1.38,  -0.44,
                                0.05, 1.17,  0.21,  0.68,  1.38, -0.56, -0.33, -0.1,
diff --git a/tests/test_Matrix.h b/tests/test_Matrix.h
index 0ebf3ebf..a657954a 100644
--- a/tests/test_Matrix.h
+++ b/tests/test_Matrix.h
@@ -1,6 +1,7 @@
 #ifndef TEST_MATRIX_H
 #define TEST_MATRIX_H
 
+#include "Constraints.h"
 #include "Debugger.h"
 #include "Matrix.h"
 #include "minunit.h"
@@ -53,8 +54,9 @@ static char *test_1_matrix()
     // remove extra space to simplify test
     int row_sizes_AT[6] = {2, 3, 3, 1, 2};
     int col_sizes_AT[6] = {3, 3, 1, 2, 1, 1};
-    int col_idxs_map_AT[6];
-    remove_extra_space(AT, row_sizes_AT, col_sizes_AT, true, col_idxs_map_AT);
+    int col_idxs_map_AT[6] = {0};
+    int n_new_cols_AT = update_column_map(col_sizes_AT, col_idxs_map_AT, 6);
+    remove_extra_space(AT, row_sizes_AT, true, col_idxs_map_AT, n_new_cols_AT);
 
     // correct answer
     double AT_vals_correct[] = {1, 3, -1, 1, 1, 2, 1, 1, 1, 1, 1};
@@ -619,7 +621,8 @@ static char *test_15_matrix()
     int row_sizes[] = {3, 2, 2, 3};
     int col_sizes[] = {4, 1, 4, 1};
     int map[4];
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int new_n_cols = update_column_map(col_sizes, map, 4);
+    remove_extra_space(A, row_sizes, true, map, new_n_cols);
 
     // correct answer
     double vals_correct[] = {9, 8, 1, 2, 9, 7, 3, 4, 1, 1};
@@ -665,7 +668,8 @@ static char *test_16_matrix()
     int row_sizes[] = {3, 3, 1, 2};
     int col_sizes[] = {4, 2, 3, 1};
     int map[4];
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 4);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     // correct answer
     double vals_correct[] = {9, 5, 4, 3, 7, 5, 1, 1, 1};
@@ -709,7 +713,8 @@ static char *test_17_matrix()
     int row_sizes[] = {3, 2, 2};
     int col_sizes[] = {3, 1, 1, 2};
     int map[4];
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 4);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     double vals_correct[] = {1, 2, 3, 4, 5, 3, 1};
     int cols_correct[] = {0, 2, 3, 0, 1, 0, 3};
@@ -756,7 +761,8 @@ static char *test_18_matrix()
     int row_sizes[] = {3, 3, 2};
     int col_sizes[] = {3, 1, 2, 2};
     int map[4];
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 4);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     mu_assert("error, vals not equal", ARRAYS_EQUAL_DOUBLE(vals, A->x, A->nnz));
 
diff --git a/tests/test_Parallel_cols.h b/tests/test_Parallel_cols.h
index 57d7562a..d1bf3155 100644
--- a/tests/test_Parallel_cols.h
+++ b/tests/test_Parallel_cols.h
@@ -40,12 +40,6 @@ static char *test_1_parallel_cols()
     remove_parallel_cols(prob);
     problem_clean(prob, true);
 
-    // print A->x
-    for (size_t i = 0; i < A->nnz; ++i)
-    {
-        printf("A->x[%zu] = %f\n", i, A->x[i]);
-    }
-
     // check that new A is correct
     double Ax_correct[] = {-1, -1};
     int Ai_correct[] = {0, 1};
@@ -267,23 +261,23 @@ static char *test_5_parallel_cols()
     problem_clean(prob, true);
 
     // check that new A is correct
-    double Ax_correct[] = {-1, -1};
-    int Ai_correct[] = {0, 1};
-    int Ap_correct[] = {0, 2};
-    mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, 2));
-    mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, 2));
+    double Ax_correct[] = {-1};
+    int Ai_correct[] = {0};
+    int Ap_correct[] = {0, 1};
+    mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, 1));
+    mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, 1));
     mu_assert("rows", check_row_starts(A, Ap_correct));
 
     // check that new variable bounds are correct
-    double lbs_correct[] = {-INF, 0};
-    double ubs_correct[] = {11, 5};
+    double lbs_correct[] = {-INF};
+    double ubs_correct[] = {11};
     mu_assert("error bounds",
-              check_bounds(constraints->bounds, lbs_correct, ubs_correct, 2));
+              check_bounds(constraints->bounds, lbs_correct, ubs_correct, 1));
 
     // check that the objective function is correct
-    double obj_correct[] = {2, 1};
-    mu_assert("error obj", ARRAYS_EQUAL_DOUBLE(obj_correct, prob->obj->c, 2));
-    mu_assert("error offset", prob->obj->offset == 0);
+    double obj_correct[] = {2};
+    mu_assert("error obj", ARRAYS_EQUAL_DOUBLE(obj_correct, prob->obj->c, 1));
+    mu_assert("error offset", prob->obj->offset == 5);
 
     PS_FREE(stgs);
     DEBUG(run_debugger(constraints, false));
@@ -348,7 +342,7 @@ static char *test_6_parallel_cols()
 /* Based on example 8 gurobi paper
     min. [4 -2 1]x
     s.t. [2 -1 -1]x <= -10
-         [-INF, 0, 0] <= x <= [4 3 5]
+         [-INF, 0, -1] <= x <= [4 3 5]
 */
 static char *test_5_negated_parallel_cols()
 {
@@ -361,7 +355,7 @@ static char *test_5_negated_parallel_cols()
 
     double lhs[] = {-INF};
     double rhs[] = {-10};
-    double lbs[] = {-INF, 0, 0};
+    double lbs[] = {-INF, 0, -1};
     double ubs[] = {4, 3, 5};
     double c[] = {4, -2, 1};
 
@@ -376,23 +370,23 @@ static char *test_5_negated_parallel_cols()
     problem_clean(prob, true);
 
     // check that new A is correct
-    double Ax_correct[] = {-1, -1};
-    int Ai_correct[] = {0, 1};
-    int Ap_correct[] = {0, 2};
-    mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, 2));
-    mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, 2));
+    double Ax_correct[] = {-1};
+    int Ai_correct[] = {0};
+    int Ap_correct[] = {0, 1};
+    mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, 1));
+    mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, 1));
     mu_assert("rows", check_row_starts(A, Ap_correct));
 
     // check that new variable bounds are correct
-    double lbs_correct[] = {-8, 0};
-    double ubs_correct[] = {INF, 5};
+    double lbs_correct[] = {-8};
+    double ubs_correct[] = {INF};
     mu_assert("error bounds",
-              check_bounds(constraints->bounds, lbs_correct, ubs_correct, 2));
+              check_bounds(constraints->bounds, lbs_correct, ubs_correct, 1));
 
     // check that the objective function is correct
-    double obj_correct[] = {-2, 1};
-    mu_assert("error obj", ARRAYS_EQUAL_DOUBLE(obj_correct, prob->obj->c, 2));
-    mu_assert("error offset", prob->obj->offset == 0);
+    double obj_correct[] = {-2};
+    mu_assert("error obj", ARRAYS_EQUAL_DOUBLE(obj_correct, prob->obj->c, 1));
+    mu_assert("error offset", prob->obj->offset == -1);
 
     PS_FREE(stgs);
     DEBUG(run_debugger(constraints, false));
@@ -403,7 +397,7 @@ static char *test_5_negated_parallel_cols()
 /* Based on example 8 gurobi paper
     min. [4 -2 1]x
     s.t. [2 -1 -1]x <= -10
-         [-INF, -INF, 0] <= x <= [4 3 5]
+         [-INF, -INF, -2] <= x <= [4 3 5]
 */
 static char *test_6_negated_parallel_cols()
 {
@@ -416,7 +410,7 @@ static char *test_6_negated_parallel_cols()
 
     double lhs[] = {-INF};
     double rhs[] = {-10};
-    double lbs[] = {-INF, -INF, 0};
+    double lbs[] = {-INF, -INF, -2};
     double ubs[] = {4, 3, 5};
     double c[] = {4, -2, 1};
 
@@ -427,27 +421,27 @@ static char *test_6_negated_parallel_cols()
     Problem *prob = presolver->prob;
     Constraints *constraints = prob->constraints;
     Matrix *A = constraints->A;
-    remove_parallel_cols(prob);
+    PresolveStatus status = remove_parallel_cols(prob);
     problem_clean(prob, true);
 
     // check that new A is correct
-    double Ax_correct[] = {-1, -1};
-    int Ai_correct[] = {0, 1};
-    int Ap_correct[] = {0, 2};
-    mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, 2));
-    mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, 2));
+    double Ax_correct[] = {-1};
+    int Ai_correct[] = {0};
+    int Ap_correct[] = {0, 1};
+    mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, 1));
+    mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, 1));
     mu_assert("rows", check_row_starts(A, Ap_correct));
 
     // check that new variable bounds are correct
-    double lbs_correct[] = {-INF, 0};
-    double ubs_correct[] = {INF, 5};
+    double lbs_correct[] = {-INF};
+    double ubs_correct[] = {INF};
     mu_assert("error bounds",
-              check_bounds(constraints->bounds, lbs_correct, ubs_correct, 2));
+              check_bounds(constraints->bounds, lbs_correct, ubs_correct, 1));
 
     // check that the objective function is correct
-    double obj_correct[] = {-2, 1};
-    mu_assert("error obj", ARRAYS_EQUAL_DOUBLE(obj_correct, prob->obj->c, 2));
-    mu_assert("error offset", prob->obj->offset == 0);
+    double obj_correct[] = {-2};
+    mu_assert("error obj", ARRAYS_EQUAL_DOUBLE(obj_correct, prob->obj->c, 1));
+    mu_assert("error offset", prob->obj->offset == -2);
 
     PS_FREE(stgs);
     DEBUG(run_debugger(constraints, false));
@@ -555,9 +549,6 @@ static char *test_8_parallel_cols()
                         5, 0, 1, 2, 3, 4, 5, 0, 5, 0, 5, 0, 5};
     int Ap_correct[] = {0, 6, 9, 15, 18, 24, 29, 35, 37, 39, 41};
 
-// on mac the order of the columns is different, so we only run this test on
-// linux
-#ifdef __linux__
     mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, A->nnz));
     mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, A->nnz));
     mu_assert("rows", check_row_starts(A, Ap_correct));
@@ -572,7 +563,6 @@ static char *test_8_parallel_cols()
     double obj_correct[] = {3, -4, -3, 6, 3, 1};
     mu_assert("error obj", ARRAYS_EQUAL_DOUBLE(obj_correct, prob->obj->c, 6));
     mu_assert("error offset", prob->obj->offset == 0);
-#endif // __linux__
 
     PS_FREE(stgs);
     DEBUG(run_debugger(constraints, false));
diff --git a/tests/test_Parallel_rows.h b/tests/test_Parallel_rows.h
index f5c8b20a..c358f23b 100644
--- a/tests/test_Parallel_rows.h
+++ b/tests/test_Parallel_rows.h
@@ -100,26 +100,19 @@ static char *test_3_parallel_rows()
     int parallel_rows[5];
     int sparsity_IDs[5];
     int coeff_hashes[5];
+    int radix_aux[5];
 
     find_parallel_rows(A, row_tags, group_start, parallel_rows, sparsity_IDs,
-                       coeff_hashes, R_TAG_INACTIVE);
+                       coeff_hashes, R_TAG_INACTIVE, radix_aux);
 
-#ifdef _WIN32
     int parallel_rows_correct[] = {0, 4, 1, 3};
-#else
-    int parallel_rows_correct[] = {4, 0, 3, 1};
-#endif
-
     int group_start_correct[] = {0, 2, 4};
     mu_assert("error", ARRAYS_EQUAL_INT(parallel_rows_correct, parallel_rows, 4));
     mu_assert("error", group_start->len == 3);
     mu_assert("error", ARRAYS_EQUAL_INT(group_start_correct, group_start->data, 3));
 
-    printf("end of test_3_parallel_rows\n");
-
     free_matrix(A);
     iVec_free(group_start);
-    printf("finished freeing memory test 3 \n");
     return 0;
 }
 
@@ -207,9 +200,10 @@ static char *test_4_parallel_rows()
     int parallel_rows[1000];
     int sparsity_IDs[1000];
     int coeff_hashes[1000];
+    int radix_aux[1000];
 
     find_parallel_rows(A, row_tags, group_start, parallel_rows, sparsity_IDs,
-                       coeff_hashes, R_TAG_INACTIVE);
+                       coeff_hashes, R_TAG_INACTIVE, radix_aux);
 
     mu_assert("error group_starts len", group_start->len == 4);
     int group_start_correct[] = {0, 8, 16, 24};
@@ -373,6 +367,7 @@ static char *test_5_parallel_rows()
     int parallel_rows[1000] = {0};
     int sparsity_IDs[1000] = {0};
     int coeff_hashes[1000] = {0};
+    int radix_aux[1000];
 
     row_tags[0] = R_TAG_INACTIVE;
     row_tags[200] = R_TAG_INACTIVE;
@@ -380,7 +375,7 @@ static char *test_5_parallel_rows()
     row_tags[310] = R_TAG_INACTIVE;
 
     find_parallel_rows(A, row_tags, group_start, parallel_rows, sparsity_IDs,
-                       coeff_hashes, R_TAG_INACTIVE);
+                       coeff_hashes, R_TAG_INACTIVE, radix_aux);
 
     mu_assert("error group_starts len", group_start->len == 4);
     // int group_start_correct[] = {0, 8, 14, 20};
@@ -934,7 +929,7 @@ static char *test_14_parallel_rows()
 
     // check A matrix
     double Ax_correct[] = {row0_vals[0], row0_vals[1], row0_vals[2], row0_vals[3],
-                           row0_vals[4], ax[0],        ax[1],        ax[2],
+                           row0_vals[4], row3_vals[0], row3_vals[1], row3_vals[2],
                            row5_vals[0], row5_vals[1], row5_vals[2], row5_vals[3]};
 
     int Ai_correct[] = {row0_cols[0], row0_cols[1], row0_cols[2], row0_cols[3],
@@ -942,12 +937,6 @@ static char *test_14_parallel_rows()
                         row5_cols[0], row5_cols[1], row5_cols[2], row5_cols[3]};
     int Ap_correct[] = {0, 5, 8, 12};
 
-    // printf("A->x:\n");
-    // print_double_array(A->x, A->nnz);
-    // printf("Ax-correct:\n");
-    // print_double_array(Ax_correct, 12);
-    // fflush(stdout);
-
     mu_assert("error Ax", ARRAYS_EQUAL_DOUBLE(Ax_correct, A->x, 12));
     mu_assert("error Ai", ARRAYS_EQUAL_INT(Ai_correct, A->i, 12));
     mu_assert("rows", check_row_starts(A, Ap_correct));
@@ -958,8 +947,8 @@ static char *test_14_parallel_rows()
               ARRAYS_EQUAL_ROWTAG(row_tags_correct, constraints->row_tags, 3));
 
     // check lhs and rhs
-    double lhs_correct[] = {-2.1, b1, -INF};
-    double rhs_correct[] = {3.1, b1, 5};
+    double lhs_correct[] = {-2.1, q2 * b1, -INF};
+    double rhs_correct[] = {3.1, q2 * b1, 5};
     mu_assert("error lhs", ARRAYS_EQUAL_DOUBLE(lhs_correct, constraints->lhs, 3));
     mu_assert("error rhs", ARRAYS_EQUAL_DOUBLE(rhs_correct, constraints->rhs, 3));
 
diff --git a/tests/test_Presolver.h b/tests/test_Presolver.h
index ea9ddd72..e4d0b320 100644
--- a/tests/test_Presolver.h
+++ b/tests/test_Presolver.h
@@ -77,6 +77,8 @@ static char *test_01_presolver()
                   0., 0.,    0., 0., 0., 0.,   -0.48, 0., 0., 10.};
 
     Settings *stgs = default_settings();
+    stgs->parallel_rows = false;
+    stgs->parallel_cols = false;
     Presolver *presolver =
         new_presolver(Ax, Ai, Ap, n_rows, n_cols, nnz, lhs, rhs, lbs, ubs, c, stgs);
     mu_assert("Presolver initialization failed", presolver != NULL);
diff --git a/tests/test_dton.h b/tests/test_dton.h
index 2e8e353c..faee42cf 100644
--- a/tests/test_dton.h
+++ b/tests/test_dton.h
@@ -5,6 +5,7 @@
 #include "Debugger.h"
 #include "PSLP_API.h"
 
+#include "Constraints.h"
 #include "Problem.h"
 #include "Workspace.h"
 #include "debug_macros.h"
@@ -45,7 +46,8 @@ static char *test_00_dton()
 
     int col_sizes[] = {SIZE_INACTIVE_COL, 2, 3, 3};
     int map[4] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int new_n_cols = update_column_map(col_sizes, map, 4);
+    remove_extra_space(A, row_sizes, true, map, new_n_cols);
 
     // check that new A is correct
     double Ax_correct[] = {-2, 1, 1, -1, 1, 1, 1, 1};
@@ -94,7 +96,8 @@ static char *test_01_dton()
 
     int col_sizes[] = {3, 3, SIZE_INACTIVE_COL, 2, 3, 3};
     int map[6] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 6);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     // check that new A is correct
     double Ax_correct[] = {1, 2, -2, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1};
@@ -143,7 +146,8 @@ static char *test_02_dton()
 
     int col_sizes[] = {3, 3, 3, 3, SIZE_INACTIVE_COL, 2};
     int map[6] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 6);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     // check that new A is correct
     double Ax_correct[] = {1, 2, 1, 1, -2, 1, 1, 1, 1, -1, 1, 1, 1, 1};
@@ -191,7 +195,8 @@ static char *test_03_dton()
 
     int col_sizes[] = {2, SIZE_INACTIVE_COL, 3, 3};
     int map[4] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 4);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     // check that new A is correct
     double Ax_correct[] = {1, 1, 1, 0.5, 1, 1, 1, 1};
@@ -239,7 +244,8 @@ static char *test_04_dton()
 
     int col_sizes[] = {3, 3, 2, SIZE_INACTIVE_COL, 3, 3};
     int map[6] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 6);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     // check that new A is correct
     double Ax_correct[] = {1, 2, 1, 1, 1, 1, 1, 0.5, 1, 1, 1, 1, 1, 1};
@@ -287,7 +293,8 @@ static char *test_05_dton()
 
     int col_sizes[] = {3, 3, 3, 3, 2, SIZE_INACTIVE_COL};
     int map[6] = {0};
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    int n_new_cols = update_column_map(col_sizes, map, 6);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     // check that new A is correct
     double Ax_correct[] = {1, 2, 1, 1, 1, 1, 1, 1, 1, 0.5, 1, 1, 1, 1};
@@ -323,7 +330,8 @@ static char *test_06_dton()
     int col_sizes[] = {3, 2, 3, 3};
     int map[4] = {0};
 
-    remove_extra_space(A, row_sizes, col_sizes, true, map);
+    // int n_new_cols = update_column_map(col_sizes, map, 4);
+    // remove_extra_space(A, row_sizes, col_sizes, true, map, n_new_cols);
 
     int stay = 1;
     int subst = 0;
@@ -338,7 +346,8 @@ static char *test_06_dton()
     mu_assert("error row_sizes", ARRAYS_EQUAL_INT(row_sizes_correct, row_sizes, 3));
 
     int col_sizes_new[] = {SIZE_INACTIVE_COL, 2, 3, 3};
-    remove_extra_space(A, row_sizes, col_sizes_new, true, map);
+    int n_new_cols = update_column_map(col_sizes_new, map, 4);
+    remove_extra_space(A, row_sizes, true, map, n_new_cols);
 
     // check that new A is correct
     double Ax_correct[] = {-2, 1, 1, -1, 1, 1, 1, 1};
@@ -1548,8 +1557,10 @@ static char *test_17_dton()
     Constraints *constraints = prob->constraints;
     Matrix *A = constraints->A;
     State *data = constraints->state;
-    remove_extra_space(A, data->row_sizes, data->col_sizes, true,
-                       data->work->mappings->cols);
+    int new_n_cols =
+        update_column_map(data->col_sizes, data->work->mappings->cols, 5);
+    remove_extra_space(A, data->row_sizes, true, data->work->mappings->cols,
+                       new_n_cols);
 
     remove_dton_eq_rows(prob, 0);
     problem_clean(prob, true);
@@ -1605,10 +1616,14 @@ static char *test_18_dton()
     Matrix *A = constraints->A;
     Matrix *AT = constraints->AT;
     State *data = constraints->state;
-    remove_extra_space(A, data->row_sizes, data->col_sizes, true,
-                       data->work->mappings->cols);
-    remove_extra_space(AT, data->col_sizes, data->row_sizes, true,
-                       data->work->mappings->rows);
+    int new_n_cols =
+        update_column_map(data->col_sizes, data->work->mappings->cols, 5);
+    int new_n_rows =
+        update_column_map(data->row_sizes, data->work->mappings->rows, 3);
+    remove_extra_space(A, data->row_sizes, true, data->work->mappings->cols,
+                       new_n_cols);
+    remove_extra_space(AT, data->col_sizes, true, data->work->mappings->rows,
+                       new_n_rows);
 
     remove_dton_eq_rows(prob, 0);
     problem_clean(prob, true);
@@ -1669,8 +1684,10 @@ static char *test_19_dton()
     Constraints *constraints = prob->constraints;
     Matrix *A = constraints->A;
     State *data = constraints->state;
-    remove_extra_space(A, data->row_sizes, data->col_sizes, true,
-                       data->work->mappings->cols);
+    int new_n_cols =
+        update_column_map(data->col_sizes, data->work->mappings->cols, 5);
+    remove_extra_space(A, data->row_sizes, true, data->work->mappings->cols,
+                       new_n_cols);
     remove_dton_eq_rows(prob, 0);
     problem_clean(prob, true);
 
@@ -1704,13 +1721,9 @@ static const char *all_tests_dton()
     //   mu_run_test(test_12_dton, counter_dton);
     mu_run_test(test_13_dton, counter_dton); // implemented
     mu_run_test(test_14_dton, counter_dton); // implemented
-    printf("before test 15\n");
     mu_run_test(test_15_dton, counter_dton); // implemented
-    printf("after test 15\n");
     mu_run_test(test_16_dton, counter_dton); // implemented
-    printf("after test 16\n");
     mu_run_test(test_17_dton, counter_dton); // implemented
-    printf("after test 17\n");
     mu_run_test(test_18_dton, counter_dton); // implemented
     //    mu_run_test(test_19_dton, counter_dton); // implemented but we don't
     //    run it
diff --git a/tests/test_postsolve.h b/tests/test_postsolve.h
index 3f19267c..c4c62d0b 100644
--- a/tests/test_postsolve.h
+++ b/tests/test_postsolve.h
@@ -434,7 +434,7 @@ static char *test_col_ston_dual_fix()
     set_settings_true(stgs);
     stgs->primal_propagation = false;
     stgs->parallel_cols = false;
-    stgs->dual_fix = false;
+    stgs->dual_fix = true;
     Presolver *presolver =
         new_presolver(Ax, Ai, Ap, n_rows, n_cols, nnz, lhs, rhs, lbs, ubs, c, stgs);
 
@@ -631,19 +631,10 @@ static char *test_6_postsolve()
     run_presolver(presolver);
 
     // construct optimal primal solution to reduced problem (computed offline).
-    // on linux/windows vs mac the parallel column kept is different, leading to
-    // different postsolved solutions
-#if defined(__linux__) || defined(_WIN32)
-    double x[] = {-2., 12.5, 21., -8.};
-    double y[] = {0., 0., 0., 0., 0., 0., 0., 0.};
-    double z[] = {3., -4., -3., 1.};
-    double obj = 0.0;
-#else
-    double x[] = {-2., -8.33333333, -21., -8.};
+    double x[] = {-2., 10.5, -(8.0 + 1.0 / 3.0), -8.};
     double y[] = {0., 0., 0., 0., 0., 0., 0., 0.};
-    double z[] = {3., 6., 3., 1.};
+    double z[] = {3., -6., 6., 1.};
     double obj = 0.0;
-#endif
 
     postsolve(presolver, x, y, z);
 
@@ -697,18 +688,10 @@ static char *test_7_postsolve()
     run_presolver(presolver);
 
     // construct optimal primal solution to reduced problem (computed offline)
-#if defined(__linux__) || defined(_WIN32)
-    double x[] = {-2., 12.5, 21., -8.};
+    double x[] = {-2., 10.5, -(8.0 + 1.0 / 3.0), -8.};
     double y[] = {0., 0., 0., 0., 0.};
-    double z[] = {3., -4., -3., 1.};
+    double z[] = {3., -6., 6., 1.};
     double obj = 0.0;
-#else
-    double x[] = {-25., -2., 21., -8.};
-    double y[] = {0., 0., 0., 0., 0.};
-    double z[] = {2., 3., -3., 1.};
-    double obj = 0.0;
-
-#endif
 
     postsolve(presolver, x, y, z);
 
@@ -717,9 +700,6 @@ static char *test_7_postsolve()
     double correct_y[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
     double correct_z[] = {2., 3., -4., -3., -6., 6., 3., 1.};
 
-    // on mac another solution seems to be postsolved, because a different parallel
-    // column is kept. So for non-linux we just check the objective value (not
-    // feasibility)
     mu_assert("postsolve error",
               is_solution_correct(presolver->sol->x, correct_x, presolver->sol->y,
                                   correct_y, presolver->sol->z, correct_z, n_rows,
diff --git a/tests/test_radix_sort.h b/tests/test_radix_sort.h
new file mode 100644
index 00000000..a6ad1f8c
--- /dev/null
+++ b/tests/test_radix_sort.h
@@ -0,0 +1,471 @@
+#ifndef TEST_RADIX_SORT_H
+#define TEST_RADIX_SORT_H
+
+#include "radix_sort.h"
+
+#include "minunit.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int counter_radix_sort = 0;
+
+// helper: check that rows are sorted by (sparsity_IDs[rows[i]],
+// coeff_hashes[rows[i]]) treated as unsigned uint32_t, matching radix sort behavior
+static int is_sorted_by_keys(const int *rows, int n, const int *sparsity_IDs,
+                             const int *coeff_hashes)
+{
+    for (int i = 1; i < n; i++)
+    {
+        unsigned int sp_prev = (unsigned int) sparsity_IDs[rows[i - 1]];
+        unsigned int sp_curr = (unsigned int) sparsity_IDs[rows[i]];
+        if (sp_prev > sp_curr)
+        {
+            return 0;
+        }
+        if (sp_prev == sp_curr)
+        {
+            unsigned int ch_prev = (unsigned int) coeff_hashes[rows[i - 1]];
+            unsigned int ch_curr = (unsigned int) coeff_hashes[rows[i]];
+            if (ch_prev > ch_curr)
+            {
+                return 0;
+            }
+        }
+    }
+    return 1;
+}
+
+// helper: check that sorted array is a permutation of 0..n-1
+static int is_permutation(const int *rows, int n)
+{
+    int *seen = (int *) calloc((size_t) n, sizeof(int));
+    if (!seen) return 0;
+    for (int i = 0; i < n; i++)
+    {
+        if (rows[i] < 0 || rows[i] >= n || seen[rows[i]])
+        {
+            free(seen);
+            return 0;
+        }
+        seen[rows[i]] = 1;
+    }
+    free(seen);
+    return 1;
+}
+
+/* Test 1: Empty input (n=0) should not crash */
+static char *test_1_radix_sort()
+{
+    int rows[1] = {0};
+    int sparsity_IDs[1] = {0};
+    int coeff_hashes[1] = {0};
+    int aux[1] = {0};
+
+    radix_sort_rows(rows, 0, sparsity_IDs, coeff_hashes, aux);
+    // just check it doesn't crash
+    return 0;
+}
+
+/* Test 2: Single element */
+static char *test_2_radix_sort()
+{
+    int rows[1] = {0};
+    int sparsity_IDs[1] = {42};
+    int coeff_hashes[1] = {99};
+    int aux[1];
+
+    radix_sort_rows(rows, 1, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("single element should remain 0", rows[0] == 0);
+    return 0;
+}
+
+/* Test 3: Already sorted input (small, uses insertion sort path) */
+static char *test_3_radix_sort()
+{
+    int n = 5;
+    int rows[] = {0, 1, 2, 3, 4};
+    int sparsity_IDs[] = {1, 2, 3, 4, 5};
+    int coeff_hashes[] = {10, 20, 30, 40, 50};
+    int aux[5];
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+    return 0;
+}
+
+/* Test 4: Reverse sorted input (small, insertion sort path) */
+static char *test_4_radix_sort()
+{
+    int n = 5;
+    int rows[] = {0, 1, 2, 3, 4};
+    int sparsity_IDs[] = {50, 40, 30, 20, 10};
+    int coeff_hashes[] = {0, 0, 0, 0, 0};
+    int aux[5];
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    // row 4 has smallest sparsity_ID (10) so should be first
+    mu_assert("first should be row 4", rows[0] == 4);
+    mu_assert("last should be row 0", rows[4] == 0);
+    return 0;
+}
+
+/* Test 5: Secondary key (coeff_hashes) breaks ties in primary key */
+static char *test_5_radix_sort()
+{
+    int n = 4;
+    int rows[] = {0, 1, 2, 3};
+    int sparsity_IDs[] = {100, 100, 100, 100}; // all same
+    int coeff_hashes[] = {40, 10, 30, 20};
+    int aux[4];
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+
+    // sorted by coeff_hash: 10(row1), 20(row3), 30(row2), 40(row0)
+    mu_assert("first", rows[0] == 1);
+    mu_assert("second", rows[1] == 3);
+    mu_assert("third", rows[2] == 2);
+    mu_assert("fourth", rows[3] == 0);
+    return 0;
+}
+
+/* Test 6: Negative sparsity_IDs treated as unsigned in radix sort path (n >= 256).
+   High bit set sorts after positive values. */
+static char *test_6_radix_sort()
+{
+    int n = 300;
+    int rows[300];
+    int sparsity_IDs[300];
+    int coeff_hashes[300];
+    int aux[300];
+
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        // mix positive and negative sparsity_IDs
+        sparsity_IDs[i] = (i % 3 == 0) ? -(i + 1) : (i + 1);
+        coeff_hashes[i] = 0;
+    }
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    // positive sparsity_IDs (unsigned < 0x80000000) should come before
+    // negative ones (unsigned >= 0x80000000)
+    int saw_negative = 0;
+    for (int i = 0; i < n; i++)
+    {
+        int sp = sparsity_IDs[rows[i]];
+        if (sp < 0) saw_negative = 1;
+        if (saw_negative && sp >= 0)
+        {
+            mu_assert("positive should not appear after negative in unsigned order",
+                      0);
+        }
+    }
+    return 0;
+}
+
+/* Test 7: Larger input (n=300) forces the radix sort path (threshold is 256) */
+static char *test_7_radix_sort()
+{
+    int n = 300;
+    int rows[300];
+    int sparsity_IDs[300];
+    int coeff_hashes[300];
+    int aux[300];
+
+    srand(12345);
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        sparsity_IDs[i] = rand() % 50;
+        coeff_hashes[i] = rand() % 100;
+    }
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+    return 0;
+}
+
+/* Test 8: Large input with random keys including negative values */
+static char *test_8_radix_sort()
+{
+    int n = 1000;
+    int *rows = (int *) malloc((size_t) n * sizeof(int));
+    int *sparsity_IDs = (int *) malloc((size_t) n * sizeof(int));
+    int *coeff_hashes = (int *) malloc((size_t) n * sizeof(int));
+    int *aux = (int *) malloc((size_t) n * sizeof(int));
+
+    srand(9999);
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        sparsity_IDs[i] = rand() - RAND_MAX / 2;
+        coeff_hashes[i] = rand() - RAND_MAX / 2;
+    }
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    free(rows);
+    free(sparsity_IDs);
+    free(coeff_hashes);
+    free(aux);
+    return 0;
+}
+
+/* Test 9: All identical keys */
+static char *test_9_radix_sort()
+{
+    int n = 300;
+    int rows[300];
+    int sparsity_IDs[300];
+    int coeff_hashes[300];
+    int aux[300];
+
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        sparsity_IDs[i] = 42;
+        coeff_hashes[i] = 77;
+    }
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    // reverse stability: identical keys → higher original indices first
+    for (int i = 0; i < n; i++)
+    {
+        mu_assert("identical keys should be reverse-stable", rows[i] == n - 1 - i);
+    }
+    return 0;
+}
+
+/* Test 10: parallel_radix_sort_rows on a large array (exercises the parallel path)
+ */
+static char *test_10_radix_sort()
+{
+    int n = 200000;
+    int *rows = (int *) malloc((size_t) n * sizeof(int));
+    int *sparsity_IDs = (int *) malloc((size_t) n * sizeof(int));
+    int *coeff_hashes = (int *) malloc((size_t) n * sizeof(int));
+    int *aux = (int *) malloc((size_t) n * sizeof(int));
+
+    srand(77777);
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        sparsity_IDs[i] = rand() - RAND_MAX / 2;
+        coeff_hashes[i] = rand() - RAND_MAX / 2;
+    }
+
+    parallel_radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    free(rows);
+    free(sparsity_IDs);
+    free(coeff_hashes);
+    free(aux);
+    return 0;
+}
+
+/* Test 11: parallel_radix_sort_rows with small input (falls back to sequential) */
+static char *test_11_radix_sort()
+{
+    int n = 50;
+    int rows[50];
+    int sparsity_IDs[50];
+    int coeff_hashes[50];
+    int aux[50];
+
+    srand(555);
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        sparsity_IDs[i] = rand() % 10;
+        coeff_hashes[i] = rand() % 10;
+    }
+
+    parallel_radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+    return 0;
+}
+
+/* Test 12: Keys with values near INT_MAX and INT_MIN (n >= 256 for radix path) */
+static char *test_12_radix_sort()
+{
+    int n = 300;
+    int rows[300];
+    int sparsity_IDs[300];
+    int coeff_hashes[300];
+    int aux[300];
+
+    // fill most with benign values
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        sparsity_IDs[i] = i;
+        coeff_hashes[i] = 0;
+    }
+
+    // set some extreme values
+    sparsity_IDs[0] = 0;
+    sparsity_IDs[1] = 2147483647;  // INT_MAX = 0x7FFFFFFF
+    sparsity_IDs[2] = -2147483648; // INT_MIN = 0x80000000
+    sparsity_IDs[3] = -1;          // 0xFFFFFFFF
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    // In unsigned order: row0(sp=0) should come before row1(sp=INT_MAX)
+    // which should come before row2(sp=INT_MIN as 0x80000000)
+    // which should come before row3(sp=-1 as 0xFFFFFFFF)
+    int pos_row0 = -1, pos_row1 = -1, pos_row2 = -1, pos_row3 = -1;
+    for (int i = 0; i < n; i++)
+    {
+        if (rows[i] == 0) pos_row0 = i;
+        if (rows[i] == 1) pos_row1 = i;
+        if (rows[i] == 2) pos_row2 = i;
+        if (rows[i] == 3) pos_row3 = i;
+    }
+    mu_assert("row0 before row1", pos_row0 < pos_row1);
+    mu_assert("row1 before row2", pos_row1 < pos_row2);
+    mu_assert("row2 before row3", pos_row2 < pos_row3);
+    return 0;
+}
+
+/* Test 13: Reverse stability on insertion sort path (n=6, groups with identical
+ * keys) */
+static char *test_13_radix_sort()
+{
+    int n = 6;
+    int rows[] = {0, 1, 2, 3, 4, 5};
+    // Group A (sp=10, ch=20): rows 0, 1, 2
+    // Group B (sp=10, ch=30): rows 3, 4, 5
+    int sparsity_IDs[] = {10, 10, 10, 10, 10, 10};
+    int coeff_hashes[] = {20, 20, 20, 30, 30, 30};
+    int aux[6];
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    // Group A (ch=20) comes first, reverse-stable: 2, 1, 0
+    mu_assert("group A pos 0", rows[0] == 2);
+    mu_assert("group A pos 1", rows[1] == 1);
+    mu_assert("group A pos 2", rows[2] == 0);
+    // Group B (ch=30) next, reverse-stable: 5, 4, 3
+    mu_assert("group B pos 0", rows[3] == 5);
+    mu_assert("group B pos 1", rows[4] == 4);
+    mu_assert("group B pos 2", rows[5] == 3);
+    return 0;
+}
+
+/* Test 14: Reverse stability on radix sort path (n=300, groups with identical keys)
+ */
+static char *test_14_radix_sort()
+{
+    int n = 300;
+    int rows[300];
+    int sparsity_IDs[300];
+    int coeff_hashes[300];
+    int aux[300];
+
+    // 3 groups of 100 with identical keys within each group
+    for (int i = 0; i < n; i++)
+    {
+        rows[i] = i;
+        sparsity_IDs[i] = 5;
+        if (i < 100)
+            coeff_hashes[i] = 10;
+        else if (i < 200)
+            coeff_hashes[i] = 20;
+        else
+            coeff_hashes[i] = 30;
+    }
+
+    radix_sort_rows(rows, (size_t) n, sparsity_IDs, coeff_hashes, aux);
+    mu_assert("should be sorted",
+              is_sorted_by_keys(rows, n, sparsity_IDs, coeff_hashes));
+    mu_assert("should be permutation", is_permutation(rows, n));
+
+    // Within each group, reverse-stable: higher original indices first
+    // Group 0 (ch=10, rows 0..99) → should appear as 99, 98, ..., 0
+    for (int i = 0; i < 100; i++)
+    {
+        mu_assert("group 0 reverse-stable", rows[i] == 99 - i);
+    }
+    // Group 1 (ch=20, rows 100..199) → should appear as 199, 198, ..., 100
+    for (int i = 0; i < 100; i++)
+    {
+        mu_assert("group 1 reverse-stable", rows[100 + i] == 199 - i);
+    }
+    // Group 2 (ch=30, rows 200..299) → should appear as 299, 298, ..., 200
+    for (int i = 0; i < 100; i++)
+    {
+        mu_assert("group 2 reverse-stable", rows[200 + i] == 299 - i);
+    }
+    return 0;
+}
+
+static const char *all_tests_radix_sort()
+{
+    mu_run_test(test_1_radix_sort, counter_radix_sort);
+    mu_run_test(test_2_radix_sort, counter_radix_sort);
+    mu_run_test(test_3_radix_sort, counter_radix_sort);
+    mu_run_test(test_4_radix_sort, counter_radix_sort);
+    mu_run_test(test_5_radix_sort, counter_radix_sort);
+    mu_run_test(test_6_radix_sort, counter_radix_sort);
+    mu_run_test(test_7_radix_sort, counter_radix_sort);
+    mu_run_test(test_8_radix_sort, counter_radix_sort);
+    mu_run_test(test_9_radix_sort, counter_radix_sort);
+    mu_run_test(test_10_radix_sort, counter_radix_sort);
+    mu_run_test(test_11_radix_sort, counter_radix_sort);
+    mu_run_test(test_12_radix_sort, counter_radix_sort);
+    mu_run_test(test_13_radix_sort, counter_radix_sort);
+    mu_run_test(test_14_radix_sort, counter_radix_sort);
+
+    return 0;
+}
+
+int test_radix_sort()
+{
+    const char *result = all_tests_radix_sort();
+    if (result != 0)
+    {
+        printf("%s\n", result);
+        printf("radix_sort: TEST FAILED!\n");
+    }
+    else
+    {
+        printf("radix_sort: ALL TESTS PASSED\n");
+    }
+    printf("radix_sort: Tests run: %d\n", counter_radix_sort);
+    return result == 0;
+}
+
+#endif // TEST_RADIX_SORT_H
diff --git a/tests/test_ston.h b/tests/test_ston.h
index 621de781..1be6c151 100644
--- a/tests/test_ston.h
+++ b/tests/test_ston.h
@@ -4,6 +4,7 @@
 #include "Debugger.h"
 #include "PSLP_API.h"
 
+#include "Simple_dual_fix.h"
 #include "StonCols.h"
 #include "debug_macros.h"
 #include "minunit.h"
@@ -932,6 +933,7 @@ static char *test_12_ston()
     Constraints *constraints = prob->constraints;
     Matrix *A = constraints->A;
     remove_ston_cols(prob);
+    simple_dual_fix(prob);
     problem_clean(prob, true);
 
     mu_assert("error",
@@ -1015,6 +1017,7 @@ static char *test_13_ston()
     Constraints *constraints = prob->constraints;
     Matrix *A = constraints->A;
     remove_ston_cols(prob);
+    simple_dual_fix(prob);
     problem_clean(prob, true);
 
     mu_assert("error",