diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index c6afdcaf..837a2b75 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -3,10 +3,10 @@ name: CI
 on:
   push:
     branches:
-    - master
+    - "*"
   pull_request:
     branches:
-    - master
+    - "*"
 
 jobs:
   build:
@@ -42,7 +42,7 @@ jobs:
       uses: uraimo/run-on-arch-action@v2
       with:
         arch: aarch64
-        distro: ubuntu20.04
+        distro: ubuntu22.04
         githubToken: ${{ github.token }}
         dockerRunArgs: |
           --volume "${PWD}:/beagle-lib"
diff --git a/examples/hmctest/hmctest.cpp b/examples/hmctest/hmctest.cpp
index ac3dba6b..056d9e7e 100755
--- a/examples/hmctest/hmctest.cpp
+++ b/examples/hmctest/hmctest.cpp
@@ -165,8 +165,8 @@ int main( int argc, const char* argv[] )
 	int nPatterns = strlen(human) * nRepeats;
 
     // change # rate category to 2
-//    int rateCategoryCount = 4;
-    int rateCategoryCount = 1;
+    int rateCategoryCount = 2;
+//    int rateCategoryCount = 1;
 
     int scaleCount = (scaling ? 7 : 0);
 
@@ -207,6 +207,11 @@ int main( int argc, const char* argv[] )
         requirementFlags |= BEAGLE_FLAG_VECTOR_NONE;
     }
 
+    int threadCount = 4;
+    if (threadCount > 1) {
+        requirementFlags |= BEAGLE_FLAG_THREADING_CPP;
+    }
+
     // create an instance of the BEAGLE library
 	int instance = beagleCreateInstance(
                                   3,				/**< Number of tip data elements (input) */
@@ -228,6 +233,10 @@ int main( int argc, const char* argv[] )
 	    exit(1);
     }
 
+    if (threadCount > 1) {
+        beagleSetCPUThreadCount(instance, threadCount);
+    }
+
 
     int rNumber = instDetails.resourceNumber;
     fprintf(stdout, "Using resource %i:\n", rNumber);
@@ -275,10 +284,10 @@ int main( int argc, const char* argv[] )
 ////        rates[i] = 3.0 * (i + 1) / (2 * rateCategoryCount + 1);
 //    }
 
-//    rates[0] = 0.14251623900062188;
-//    rates[1] = 1.857483760999378;
+    rates[0] = 0.14251623900062188;
+    rates[1] = 1.857483760999378;
 
-    rates[0] = 1.0;
+//    rates[0] = 1.0;
 
 	beagleSetCategoryRates(instance, &rates[0]);
 
diff --git a/libhmsbeagle/CPU/BeagleCPU4StateImpl.h b/libhmsbeagle/CPU/BeagleCPU4StateImpl.h
index 56c4ce8e..3a74f4b1 100644
--- a/libhmsbeagle/CPU/BeagleCPU4StateImpl.h
+++ b/libhmsbeagle/CPU/BeagleCPU4StateImpl.h
@@ -111,25 +111,27 @@ class BeagleCPU4StateImpl : public BeagleCPUImpl<BEAGLE_CPU_GENERIC> {
                                        int endPattern);
 
     virtual void calcEdgeLogDerivativesStates(const int *tipStates,
-                                             const REALTYPE *preOrderPartial,
-                                             const int firstDerivativeIndex,
-                                             const int secondDerivativeIndex,
-                                             const double *categoryRates,
-                                             const REALTYPE *categoryWeights,
-                                             double *outDerivatives,
-                                             double *outSumDerivatives,
-                                             double *outSumSquaredDerivatives);
+                                              const REALTYPE *preOrderPartial,
+                                              const int firstDerivativeIndex,
+                                              const int secondDerivativeIndex,
+                                              const double *categoryRates,
+                                              const REALTYPE *categoryWeights,
+                                              double *outDerivatives,
+                                              double *outSumDerivatives,
+                                              double *outSumSquaredDerivatives,
+                                              int cacheOffset);
 
     virtual void calcEdgeLogDerivativesPartials(const REALTYPE *postOrderPartial,
-                                               const REALTYPE *preOrderPartial,
-                                               const int firstDerivativeIndex,
-                                               const int secondDerivativeIndex,
-                                               const double *categoryRates,
-                                               const REALTYPE *categoryWeights,
-                                               const int scalingFactorsIndex,
-                                               double *outDerivatives,
-                                               double *outSumDerivatives,
-                                               double *outSumSquaredDerivatives);
+                                                const REALTYPE *preOrderPartial,
+                                                const int firstDerivativeIndex,
+                                                const int secondDerivativeIndex,
+                                                const double *categoryRates,
+                                                const REALTYPE *categoryWeights,
+                                                const int scalingFactorsIndex,
+                                                double *outDerivatives,
+                                                double *outSumDerivatives,
+                                                double *outSumSquaredDerivatives,
+                                                int cacheOffset);
 
     virtual void calcCrossProductsStates(const int *tipStates,
                                          const REALTYPE *preOrderPartial,
diff --git a/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp b/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp
index e6988194..b2614595 100644
--- a/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp
+++ b/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp
@@ -1008,7 +1008,8 @@ void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesStates(const
                                                                            const REALTYPE *categoryWeights,
                                                                            double *outDerivatives,
                                                                            double *outSumDerivatives,
-                                                                           double *outSumSquaredDerivatives) {
+                                                                           double *outSumSquaredDerivatives,
+                                                                           int cacheOffset) {
 
     for (int category = 0; category < kCategoryCount; category++) {
 
@@ -1031,8 +1032,8 @@ void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesStates(const
 
             REALTYPE denominator = preOrderPartial[localPatternOffset + (state & 3)];
 
-            grandNumeratorDerivTmp[pattern] += categoryWeights[category] * numerator;
-            grandDenominatorDerivTmp[pattern] += categoryWeights[category] * denominator;
+            grandNumeratorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * numerator;
+            grandDenominatorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * denominator;
         }
     }
 }
@@ -1047,7 +1048,8 @@ void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesPartials(con
                                                                              const int scalingFactorsIndex,
                                                                              double *outDerivatives,
                                                                              double *outSumDerivatives,
-                                                                             double *outSumSquaredDerivatives) {
+                                                                             double *outSumSquaredDerivatives,
+                                                                             int cacheOffset) {
 
     const REALTYPE* transMatrix = gTransitionMatrices[firstDerivativeIndex];
 
@@ -1073,8 +1075,8 @@ void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesPartials(con
 //            grandDenominatorDerivTmp[k] += (postPartials[v] * prePartials[v] + postPartials[v + 1] * prePartials[v + 1]
 //                    + postPartials[v + 2] * prePartials[v + 2] + postPartials[v + 3] * prePartials[v + 3]) * weight;
 
-            grandDenominatorDerivTmp[k] += (p10 * p00 + p11 * p01 + p12 * p02 + p13 * p03) * weight;
-            grandNumeratorDerivTmp[k] += (sum10 * p00 + sum11 * p01 + sum12 * p02 + sum13 * p03) * weight;
+            grandDenominatorDerivTmp[cacheOffset + k] += (p10 * p00 + p11 * p01 + p12 * p02 + p13 * p03) * weight;
+            grandNumeratorDerivTmp[cacheOffset + k] += (sum10 * p00 + sum11 * p01 + sum12 * p02 + sum13 * p03) * weight;
 
             v += 4;
         }
diff --git a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h
index d1881311..348ffc72 100644
--- a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h
+++ b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h
@@ -165,26 +165,27 @@ class BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE> : public BeagleCPU4StateIm
 protected:
     virtual int getPaddedPatternsModulus();
 
-	virtual void accumulateDerivatives(double* outDerivatives,
-									   double* outSumDerivatives,
-									   double* outSumSquaredDerivatives);
+	virtual void accumulateDerivatives(double *outDerivatives, double *outSumDerivatives,
+                                       double *outSumSquaredDerivatives, int offset);
 
 private:
 
 	template <bool DoDerivatives>
-	void accumulateDerivativesDispatch1(double* outDerivatives,
-										double* outSumDerivatives,
-										double* outSumSquaredDerivatives);
+    void
+    accumulateDerivativesDispatch1(double *outDerivatives, double *outSumDerivatives, double *outSumSquaredDerivatives,
+                                   int offset);
 
 	template <bool DoDerivatives, bool DoSum>
-	void accumulateDerivativesDispatch2(double* outDerivatives,
-										double* outSumDerivatives,
-										double* outSumSquaredDerivatives);
+    void accumulateDerivativesDispatch2(double *outDerivatives,
+                                        double *outSumDerivatives,
+                                        double *outSumSquaredDerivatives,
+                                        int offset);
 
 	template <bool DoDerivatives, bool DoSum, bool DoSumSquared>
-	void accumulateDerivativesImpl(double* outDerivatives,
-								   double* outSumDerivatives,
-								   double* outSumSquaredDerivatives);
+    void accumulateDerivativesImpl(double *outDerivatives,
+                                   double *outSumDerivatives,
+                                   double *outSumSquaredDerivatives,
+                                   int offset);
 
     virtual void calcStatesStates(double* destP,
                                   const int* states1,
@@ -260,7 +261,8 @@ class BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE> : public BeagleCPU4StateIm
                                                 const int scalingFactorsIndex,
                                                 double* outDerivatives,
                                                 double* outSumDerivatives,
-                                                double* outSumSquaredDerivatives);
+                                                double* outSumSquaredDerivatives,
+												int cacheOffset);
 
     virtual void calcEdgeLogDerivativesStates(const int* tipStates,
                                               const double *__restrict preOrderPartial,
@@ -270,7 +272,8 @@ class BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE> : public BeagleCPU4StateIm
                                               const double* __restrict categoryWeights,
                                               double *outDerivatives,
                                               double *outSumDerivatives,
-                                              double *outSumSquaredDerivatives);
+                                              double *outSumSquaredDerivatives,
+											  int cacheOffset);
 
     virtual void calcPartialsPartialsFixedScaling(double* __restrict destP,
                                                   const double* __restrict child0Partials,
diff --git a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp
index 52f3b6f8..69dc8ce9 100644
--- a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp
+++ b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp
@@ -639,10 +639,10 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcCrossProductsStates(co
 }
 
 BEAGLE_CPU_4_SSE_TEMPLATE template <bool DoDerivatives, bool DoSum, bool DoSumSquared>
-void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesImpl(
-        double* outDerivatives,
-        double* outSumDerivatives,
-        double* outSumSquaredDerivatives) {
+void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesImpl(double *outDerivatives,
+                                                                                double *outSumDerivatives,
+                                                                                double *outSumSquaredDerivatives,
+                                                                                int offset) {
 
     V_Real vSum = VEC_SETZERO();
     V_Real vSumSquared = VEC_SETZERO();
@@ -650,8 +650,8 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesImpl(
     int k = 0;
     for (; k < kPatternCount - 1; k += 2) {
 
-        V_Real numerator = VEC_LOAD(grandNumeratorDerivTmp + k);
-        V_Real denominator = VEC_LOAD(grandDenominatorDerivTmp + k);
+        V_Real numerator = VEC_LOAD(grandNumeratorDerivTmp + k + offset);
+        V_Real denominator = VEC_LOAD(grandDenominatorDerivTmp + k + offset);
         V_Real derivative = VEC_DIV(numerator, denominator);
         V_Real patternWeight = VEC_LOAD(gPatternWeights + k);
 
@@ -681,7 +681,7 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesImpl(
     }
 
     for (; k < kPatternCount; ++k) {
-        double derivative = grandNumeratorDerivTmp[k] / grandDenominatorDerivTmp[k];
+        double derivative = grandNumeratorDerivTmp[k + offset] / grandDenominatorDerivTmp[k + offset];
         if (DoDerivatives) {
             outDerivatives[k] = derivative;
         }
@@ -703,46 +703,47 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesImpl(
 }
 
 BEAGLE_CPU_4_SSE_TEMPLATE template <bool DoDerivatives, bool DoSum>
-void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesDispatch2(
-        double* outDerivatives,
-        double* outSumDerivatives,
-        double* outSumSquaredDerivatives) {
+void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesDispatch2(double *outDerivatives,
+                                                                                     double *outSumDerivatives,
+                                                                                     double *outSumSquaredDerivatives,
+                                                                                     int offset) {
 
     if (outSumSquaredDerivatives == NULL) {
         accumulateDerivativesImpl<DoDerivatives, DoSum, false>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     } else {
         accumulateDerivativesImpl<DoDerivatives, DoSum, true>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     }
 }
 
 BEAGLE_CPU_4_SSE_TEMPLATE template <bool DoDerivatives>
-void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesDispatch1(
-        double* outDerivatives,
-        double* outSumDerivatives,
-        double* outSumSquaredDerivatives) {
+void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivativesDispatch1(double *outDerivatives,
+                                                                                     double *outSumDerivatives,
+                                                                                     double *outSumSquaredDerivatives,
+                                                                                     int offset) {
 
     if (outSumDerivatives == NULL) {
         accumulateDerivativesDispatch2<DoDerivatives, false>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     } else {
         accumulateDerivativesDispatch2<DoDerivatives, true>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     }
 }
 
 
 BEAGLE_CPU_4_SSE_TEMPLATE
-void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivatives(double* outDerivatives,
-                                                                            double* outSumDerivatives,
-                                                                            double* outSumSquaredDerivatives) {
+void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::accumulateDerivatives(double *outDerivatives,
+                                                                            double *outSumDerivatives,
+                                                                            double *outSumSquaredDerivatives,
+                                                                            int offset) {
     if (outDerivatives == NULL) {
         accumulateDerivativesDispatch1<false>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     } else {
         accumulateDerivativesDispatch1<true>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     }
 }
 
@@ -756,7 +757,8 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogDerivativesPart
                                                                                      const int scalingFactorsIndex,
                                                                                      double* outDerivatives,
                                                                                      double* outSumDerivatives,
-                                                                                     double* outSumSquaredDerivatives) {
+                                                                                     double* outSumSquaredDerivatives,
+                                                                                     int cacheOffset) {
     double* cl_p = integrationTmp;
     memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double));
 
@@ -809,8 +811,8 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogDerivativesPart
             double numer = _mm_cvtsd_f64(vnumer) * wt[l];
             double denon = _mm_cvtsd_f64(vdenom) * wt[l];
 
-            grandNumeratorDerivTmp[k] += numer; // TODO Merge [numer, denom] into single SSE transactions
-            grandDenominatorDerivTmp[k] += denon;
+            grandNumeratorDerivTmp[cacheOffset + k] += numer; // TODO Merge [numer, denom] into single SSE transactions
+            grandDenominatorDerivTmp[cacheOffset + k] += denon;
 
             v += 4;
         }
@@ -831,7 +833,8 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogDerivativesStat
                                                                                    const double* categoryWeights,
                                                                                    double *outDerivatives,
                                                                                    double *outSumDerivatives,
-                                                                                   double *outSumSquaredDerivatives) {
+                                                                                   double *outSumSquaredDerivatives,
+                                                                                   int cacheOffset) {
     double* cl_p = integrationTmp;
     memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double));
 
@@ -866,8 +869,8 @@ void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogDerivativesStat
             double numer = _mm_cvtsd_f64(vnumer);
             double denom = cl_r[stateChild & 3]; cl_r += 4;
 
-            grandNumeratorDerivTmp[k] += numer * wt[l];
-            grandDenominatorDerivTmp[k] += denom * wt[l];
+            grandNumeratorDerivTmp[cacheOffset + k] += numer * wt[l];
+            grandDenominatorDerivTmp[cacheOffset + k] += denom * wt[l];
         }
         w += OFFSET*4;
         vcl_r += 2 * kExtraPatterns;
diff --git a/libhmsbeagle/CPU/BeagleCPUImpl.h b/libhmsbeagle/CPU/BeagleCPUImpl.h
index 04d82974..b9751cbb 100644
--- a/libhmsbeagle/CPU/BeagleCPUImpl.h
+++ b/libhmsbeagle/CPU/BeagleCPUImpl.h
@@ -45,8 +45,8 @@
 
 //  TODO: assess following cut-offs dynamically
 #define BEAGLE_CPU_ASYNC_HW_THREAD_COUNT_THRESHOLD     16  // CPU category threshold
-#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_LOW        256  // do not use CPU auto-threading for problems with fewer patterns on CPUs with many cores
-#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_HIGH       768  // do not use CPU auto-threading for problems with fewer patterns on CPUs with few cores
+#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_LOW         256 // do not use CPU auto-threading for problems with fewer patterns on CPUs with many cores
+#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_HIGH        768 // do not use CPU auto-threading for problems with fewer patterns on CPUs with few cores
 #define BEAGLE_CPU_ASYNC_LIMIT_PATTERN_COUNT       262144  // do not use all CPU cores for problems with fewer patterns
 
 namespace beagle {
@@ -462,6 +462,13 @@ class BeagleCPUImpl : public BeagleImpl {
                               int count,
                               int cumulativeScaleIndex);
 
+
+    virtual int calcEdgeLogDerivativesByAutoPartitionAsync(const int *operations,
+                                                           int count,
+                                                           double *siteLogLikelihoods,
+                                                           double *outLogFirstDerivatives,
+                                                           double *outLogDiagonalSecondDerivatives);
+
     virtual int calcEdgeLogDerivatives(const int *postBufferIndices,
                                        const int *preBufferIndices,
                                        const int *firstDerivativeIndices,
@@ -474,16 +481,15 @@ class BeagleCPUImpl : public BeagleImpl {
                                        double *outLogFirstDerivatives,
                                        double *outLogDiagonalSecondDerivatives);
 
-    virtual void calcEdgeLogDerivativesStates(const int *tipStates,
-                                              const REALTYPE *preOrderPartial,
+    virtual void calcEdgeLogDerivativesStates(const int *tipStates, const REALTYPE *preOrderPartial,
                                               const int firstDerivativeIndex,
                                               const int secondDerivativeIndex,
                                               const double *categoryRates,
                                               const REALTYPE *categoryWeights,
-//                                              const REALTYPE *cumulativeScaleBuffer,
                                               double *siteLogLikelihoods,
                                               double *outLogFirstDerivatives,
-                                              double *outLogDiagonalSecondDerivatives);
+                                              double *outLogDiagonalSecondDerivatives,
+                                              int cacheOffset);
 
     virtual void calcEdgeLogDerivativesPartials(const REALTYPE *postOrderPartial,
                                                 const REALTYPE *preOrderPartial,
@@ -492,10 +498,10 @@ class BeagleCPUImpl : public BeagleImpl {
                                                 const double *categoryRates,
                                                 const REALTYPE *categoryWeights,
                                                 const int scalingFactorsIndex,
-//                                                const REALTYPE *cumulativeScaleBuffer,
                                                 double *siteLogLikelihoods,
                                                 double *outLogFirstDerivatives,
-                                                double *outLogDiagonalSecondDerivatives);
+                                                double *outLogDiagonalSecondDerivatives,
+                                                int cacheOffset);
 
     virtual int calcCrossProducts(const int *postBufferIndices,
                                   const int *preBufferIndices,
@@ -522,11 +528,10 @@ class BeagleCPUImpl : public BeagleImpl {
                                            double *outCrossProducts,
                                            double *outSumSquaredDerivatives);
 
-    virtual void resetDerivativeTemporaries();
+    virtual void resetDerivativeTemporaries(int offset);
 
-    virtual void accumulateDerivatives(double* outDerivatives,
-                               double* outSumDerivatives,
-                               double* outSumSquaredDerivatives);
+    virtual void accumulateDerivatives(double *outDerivatives, double *outSumDerivatives,
+                                       double *outSumSquaredDerivatives, int offset);
 
     virtual void autoPartitionPartialsOperations(const int* operations,
                                                  int* partitionOperations,
@@ -772,19 +777,17 @@ class BeagleCPUImpl : public BeagleImpl {
 private:
 
     template <bool DoDerivatives>
-    void accumulateDerivativesDispatch1(double* outDerivatives,
-                               double* outSumDerivatives,
-                               double* outSumSquaredDerivatives);
+    void
+    accumulateDerivativesDispatch1(double *outDerivatives, double *outSumDerivatives, double *outSumSquaredDerivatives,
+                                   int offset);
 
     template <bool DoDerivatives, bool DoSum>
-    void accumulateDerivativesDispatch2(double* outDerivatives,
-                               double* outSumDerivatives,
-                               double* outSumSquaredDerivatives);
+    void accumulateDerivativesDispatch2(double *outDerivatives, double *outSumDerivatives,
+                                        double *outSumSquaredDerivatives, int offset);
 
     template <bool DoDerivatives, bool DoSum, bool DoSumSquared>
-    void accumulateDerivativesImpl(double* outDerivatives,
-                               double* outSumDerivatives,
-                               double* outSumSquaredDerivatives);
+    void accumulateDerivativesImpl(double *outDerivatives, double *outSumDerivatives,
+                                   double *outSumSquaredDerivatives, int offset);
 };
 
 BEAGLE_CPU_FACTORY_TEMPLATE
diff --git a/libhmsbeagle/CPU/BeagleCPUImpl.hpp b/libhmsbeagle/CPU/BeagleCPUImpl.hpp
index 8cf0349d..0e48881a 100644
--- a/libhmsbeagle/CPU/BeagleCPUImpl.hpp
+++ b/libhmsbeagle/CPU/BeagleCPUImpl.hpp
@@ -809,6 +809,11 @@ int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPatternPartitions(int partitionCount,
     assert(inPatternPartitions != 0L);
 
     kPartitionCount = partitionCount;
+    grandDenominatorDerivTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * (kPaddedPatternCount + 1) * kPartitionCount);
+    grandNumeratorDerivTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * (kPaddedPatternCount + 1)* kPartitionCount);
+    firstDerivTmp =  (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPaddedPatternCount * kStateCount * kPartitionCount);
+    secondDerivTmp =  (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPaddedPatternCount * kStateCount * kPartitionCount);
+
 
     if (!kPartitionsInitialised) {
         gPatternPartitions = (int*) malloc(sizeof(int) * kPatternCount);
@@ -1329,10 +1334,13 @@ int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updatePrePartials(const int *operations,
                                         cumulativeScaleIndex);
         count *= kPartitionCount;
         returnCode = upPrePartialsByPartitionAsync((const int*) gAutoPartitionOperations,
-                                                   count);
+                                                count);
     } else {
         bool byPartition = false;
-        returnCode = upPrePartials(byPartition, operations, count, cumulativeScaleIndex);
+        returnCode = upPrePartials(byPartition,
+                                   operations,
+                                   count,
+                                   cumulativeScaleIndex);
     }
 
     return returnCode;
@@ -1373,13 +1381,111 @@ int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateEdgeDerivatives(const int *postB
                                                                    double *outDerivatives,
                                                                    double *outSumDerivatives,
                                                                    double *outSumSquaredDerivatives) {
-    return calcEdgeLogDerivatives(
-            postBufferIndices, preBufferIndices,
-            derivativeMatrixIndices, NULL,
-            categoryWeightsIndices,categoryRatesIndices, cumulativeScaleIndices,
-            count,
-            outDerivatives,
-            outSumDerivatives, outSumSquaredDerivatives);
+    int returnCode = BEAGLE_ERROR_GENERAL;
+
+    if (kAutoPartitioningEnabled && kPartitionCount > 1) {
+
+
+
+        int numOps  = BEAGLE_OP_COUNT;
+        int numOpsP = BEAGLE_PARTITION_OP_COUNT;
+
+        int perThreadCount = (count - (count % kPartitionCount)) / kPartitionCount;
+
+#ifdef BEAGLE_DEBUG_FLOW
+        std::cerr<<"Mark 0" << ", Original count = "<< count << ", perThreadCount = " << perThreadCount <<std::endl;
+#endif
+
+
+        if (gThreadOpCounts == NULL) {
+            memset(gThreadOpCounts, 0, sizeof(int) * kNumThreads);
+        }
+
+
+
+        std::fill(gThreadOpCounts, gThreadOpCounts + kNumThreads, 0);
+
+#ifdef BEAGLE_DEBUG_FLOW
+        //        std::cerr<<"Mark 1"<<std::endl;
+#endif
+
+        if (gThreadOperations == NULL) {
+
+#ifdef BEAGLE_DEBUG_FLOW
+            std::cerr<<"Mark 1.5"<<std::endl;
+#endif
+            gThreadOperations = (int**) malloc(sizeof(int*) * kNumThreads);
+        }
+
+        for (int i=0; i<kPartitionCount; i++) {
+            if (gThreadOperations[i] == NULL) {
+                gThreadOperations[i] = (int*) malloc(sizeof(int) * BEAGLE_PARTITION_OP_COUNT * kBufferCount * kPartitionCount);
+            }
+        }
+
+#ifdef BEAGLE_DEBUG_FLOW
+        std::cerr<<"Mark 2"<<std::endl;
+#endif
+
+        for (int i = 0; i < count; i++) {
+            const int partitionIndex = (i * kPartitionCount) / count;
+
+            gThreadOperations[partitionIndex][gThreadOpCounts[partitionIndex] * numOpsP] = postBufferIndices[i];
+            gThreadOperations[partitionIndex][gThreadOpCounts[partitionIndex] * numOpsP + 1] = preBufferIndices[i];
+            gThreadOperations[partitionIndex][gThreadOpCounts[partitionIndex] * numOpsP + 2] = derivativeMatrixIndices[i];
+            gThreadOperations[partitionIndex][gThreadOpCounts[partitionIndex] * numOpsP + 3] = categoryWeightsIndices[0];
+            gThreadOperations[partitionIndex][gThreadOpCounts[partitionIndex] * numOpsP + 4] = i;
+            gThreadOperations[partitionIndex][gThreadOpCounts[partitionIndex] * numOpsP + 5] = partitionIndex;
+            gThreadOpCounts[partitionIndex]++;
+
+        }
+
+
+#ifdef BEAGLE_DEBUG_FLOW
+        std::cerr<<"Mark 3"<<std::endl;
+#endif
+
+//
+//        count *= kPartitionCount;
+//        returnCode = upPartialsByPartitionAsync((const int*) gAutoPartitionOperations,
+//                                                count);
+
+        for (int i=0; i<kPartitionCount; i++) {
+            std::packaged_task<void()> threadTask(
+                    std::bind(&BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesByAutoPartitionAsync, this,
+                              (const int*) gThreadOperations[i],
+                              gThreadOpCounts[i],
+                              outDerivatives,
+                              outSumDerivatives,
+                              outSumSquaredDerivatives));
+
+            gFutures[i] = threadTask.get_future();
+            threadData* td = &gThreads[i];
+
+            std::unique_lock<std::mutex> l(td->m);
+            td->jobs.push(std::move(threadTask));
+            l.unlock();
+
+            gThreads[i].cv.notify_one();
+        }
+
+        for (int i=0; i<kNumThreads; i++) {
+            gFutures[i].wait();
+        }
+
+        returnCode = BEAGLE_SUCCESS;
+
+    } else {
+        returnCode = calcEdgeLogDerivatives(
+                postBufferIndices, preBufferIndices,
+                derivativeMatrixIndices, NULL,
+                categoryWeightsIndices, categoryRatesIndices, cumulativeScaleIndices,
+                count,
+                outDerivatives,
+                outSumDerivatives, outSumSquaredDerivatives);
+    }
+
+    return returnCode;
 }
 
         BEAGLE_CPU_TEMPLATE
@@ -1428,9 +1534,8 @@ int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updatePrePartialsByPartition(const int* o
     int returnCode = BEAGLE_ERROR_GENERAL;
 
     if (kThreadingEnabled) {
-//        TODO
-//        returnCode = upPrePartialsByPartitionAsync(operations,
-//                                                   count);
+        returnCode = upPrePartialsByPartitionAsync(operations,
+                                                   count);
     } else {
         bool byPartition = true;
         returnCode = upPrePartials(byPartition,
@@ -1987,6 +2092,100 @@ int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcCrossProducts(const int *postBufferIn
     return returnCode;
 }
 
+BEAGLE_CPU_TEMPLATE
+int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesByAutoPartitionAsync(const int *operations,
+                                                                                  int count,
+                                                                                  double *outDerivatives,
+                                                                                  double *outSumDerivatives,
+                                                                                  double *outSumSquaredDerivatives) {
+
+    int returnCode = BEAGLE_SUCCESS;
+
+    int numOps  = BEAGLE_OP_COUNT;
+    int numOpsP = BEAGLE_PARTITION_OP_COUNT;
+
+    const int secondDerivativeIndex = BEAGLE_OP_NONE;
+    const double *categoryRates = NULL; // gCategoryRates[categoryRatesIndices[0]]; // TODO Generalize
+    const REALTYPE *categoryWeights = gCategoryWeights[operations[3]]; // TODO Generalize
+
+
+
+    for (int nodeNum = 0; nodeNum < count; nodeNum++) {
+
+        const REALTYPE *preOrderPartial = gPartials[operations[nodeNum * numOpsP + 1]];
+        const int *tipStates = gTipStates[operations[nodeNum * numOpsP]];
+
+        const int firstDerivativeIndex = operations[nodeNum * numOpsP + 2];
+        const int scalingFactorsIndex = -1; // cumulativeScaleIndices[nodeNum];
+
+        const int patternOffset = operations[nodeNum * numOpsP + 4] * kPatternCount;
+        const int threadOffset = operations[nodeNum * numOpsP + 5] * ((kPaddedPatternCount + 1) / 2 * 2);
+
+
+#ifdef BEAGLE_DEBUG_FLOW
+        std::cerr<<"Job = " << operations[nodeNum * numOpsP + 4] <<std::endl;
+        std::cerr<<"Post index = " << operations[nodeNum * numOpsP] <<std::endl;
+        std::cerr<<"Pre index = " << operations[nodeNum * numOpsP + 1] <<std::endl;
+        std::cerr<<"First Derivative index = " << operations[nodeNum * numOpsP + 2] <<std::endl;
+#endif
+
+
+
+        double* outDerivativesForNode = (outDerivatives == NULL) ?
+                                        NULL : outDerivatives + patternOffset;
+        double* outSumDerivativesForNode = (outSumDerivatives == NULL) ?
+                                           NULL : outSumDerivatives + operations[nodeNum * numOpsP + 4];
+        double* outSumSquaredDerivativesForNode = (outSumSquaredDerivatives == NULL) ?
+                                                  NULL : outSumSquaredDerivatives + operations[nodeNum * numOpsP + 4];
+
+        resetDerivativeTemporaries(threadOffset);
+
+#ifdef BEAGLE_DEBUG_FLOW
+        std::cerr<<"Almost almost finished Job = " << operations[nodeNum * numOpsP + 4] <<std::endl;
+#endif
+
+        if (tipStates != NULL) {
+
+            calcEdgeLogDerivativesStates(tipStates, preOrderPartial, firstDerivativeIndex,
+                                         secondDerivativeIndex, categoryRates, categoryWeights,
+                                         outDerivativesForNode,
+                                         outSumDerivativesForNode,
+                                         outSumSquaredDerivativesForNode, threadOffset);
+
+        } else {
+
+            const REALTYPE *postOrderPartial = gPartials[operations[nodeNum * numOpsP]];
+            calcEdgeLogDerivativesPartials(postOrderPartial, preOrderPartial, firstDerivativeIndex,
+                                           secondDerivativeIndex, categoryRates, categoryWeights,
+                                           scalingFactorsIndex,
+                                           outDerivativesForNode,
+                                           outSumDerivativesForNode,
+                                           outSumSquaredDerivativesForNode, threadOffset);
+        }
+
+#ifdef BEAGLE_DEBUG_FLOW
+        std::cerr<<"Almost finished Job = " << operations[nodeNum * numOpsP + 4] <<std::endl;
+#endif
+
+        accumulateDerivatives(outDerivativesForNode,
+                              outSumDerivativesForNode,
+                              outSumSquaredDerivativesForNode, threadOffset);
+
+#ifdef BEAGLE_DEBUG_FLOW
+        std::cerr<<"Finished Job = " << operations[nodeNum * numOpsP + 4] << ", current / count = " << nodeNum << "/" << count <<std::endl;
+#endif
+
+    }
+
+#ifdef BEAGLE_DEBUG_FLOW
+    std::cerr<<"Reached return!" <<std::endl;
+#endif
+
+    return returnCode;
+}
+
+
+
 BEAGLE_CPU_TEMPLATE
 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivatives(const int *postBufferIndices,
                                                               const int *preBufferIndices,
@@ -2022,30 +2221,30 @@ int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivatives(const int *postBuf
         double* outSumSquaredDerivativesForNode = (outSumSquaredDerivatives == NULL) ?
                 NULL : outSumSquaredDerivatives + nodeNum;
 
-        resetDerivativeTemporaries();
+        resetDerivativeTemporaries(0);
 
         if (tipStates != NULL) {
 
             calcEdgeLogDerivativesStates(tipStates, preOrderPartial, firstDerivativeIndex,
-                                        secondDerivativeIndex, categoryRates, categoryWeights,
-                                        outDerivativesForNode,
-                                        outSumDerivativesForNode,
-                                        outSumSquaredDerivativesForNode);
+                                         secondDerivativeIndex, categoryRates, categoryWeights,
+                                         outDerivativesForNode,
+                                         outSumDerivativesForNode,
+                                         outSumSquaredDerivativesForNode, 0);
 
         } else {
 
             const REALTYPE *postOrderPartial = gPartials[postBufferIndices[nodeNum]];
             calcEdgeLogDerivativesPartials(postOrderPartial, preOrderPartial, firstDerivativeIndex,
-                                       secondDerivativeIndex, categoryRates, categoryWeights,
-                                       scalingFactorsIndex,
-                                       outDerivativesForNode,
-                                       outSumDerivativesForNode,
-                                       outSumSquaredDerivativesForNode);
+                                           secondDerivativeIndex, categoryRates, categoryWeights,
+                                           scalingFactorsIndex,
+                                           outDerivativesForNode,
+                                           outSumDerivativesForNode,
+                                           outSumSquaredDerivativesForNode, 0);
         }
 
         accumulateDerivatives(outDerivativesForNode,
-                outSumDerivativesForNode,
-                outSumSquaredDerivativesForNode);
+                              outSumDerivativesForNode,
+                              outSumSquaredDerivativesForNode, 0);
 
     }
 
@@ -2053,16 +2252,16 @@ int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivatives(const int *postBuf
 }
 
 BEAGLE_CPU_TEMPLATE template <bool DoDerivatives, bool DoSum, bool DoSumSquared>
-void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivativesImpl(
-        double* outDerivatives,
-        double* outSumDerivatives,
-        double* outSumSquaredDerivatives) {
+void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivativesImpl(double *outDerivatives,
+                                                                  double *outSumDerivatives,
+                                                                  double *outSumSquaredDerivatives,
+                                                                  int offset) {
 
     REALTYPE sum = 0.0;
     REALTYPE sumSquared = 0.0;
 
     for (int k = 0; k < kPatternCount; k++) {
-        REALTYPE derivative = grandNumeratorDerivTmp[k] / grandDenominatorDerivTmp[k];
+        REALTYPE derivative = grandNumeratorDerivTmp[offset + k] / grandDenominatorDerivTmp[offset + k];
         if (DoDerivatives) {
             outDerivatives[k] = derivative;
         }
@@ -2084,65 +2283,69 @@ void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivativesImpl(
 }
 
 BEAGLE_CPU_TEMPLATE template <bool DoDerivatives, bool DoSum>
-void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivativesDispatch2(
-        double* outDerivatives,
-        double* outSumDerivatives,
-        double* outSumSquaredDerivatives) {
+void
+BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivativesDispatch2(double *outDerivatives,
+                                                                  double *outSumDerivatives,
+                                                                  double *outSumSquaredDerivatives,
+                                                                  int offset) {
 
     if (outSumSquaredDerivatives == NULL) {
         accumulateDerivativesImpl<DoDerivatives, DoSum, false>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     } else {
         accumulateDerivativesImpl<DoDerivatives, DoSum, true>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     }
 }
 
 BEAGLE_CPU_TEMPLATE template <bool DoDerivatives>
-void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivativesDispatch1(
-        double* outDerivatives,
-        double* outSumDerivatives,
-        double* outSumSquaredDerivatives) {
+void
+BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivativesDispatch1(double *outDerivatives,
+                                                                  double *outSumDerivatives,
+                                                                  double *outSumSquaredDerivatives,
+                                                                  int offset) {
 
     if (outSumDerivatives == NULL) {
         accumulateDerivativesDispatch2<DoDerivatives, false>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     } else {
         accumulateDerivativesDispatch2<DoDerivatives, true>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     }
 }
 
 BEAGLE_CPU_TEMPLATE
-void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::resetDerivativeTemporaries() {
-        std::fill(grandNumeratorDerivTmp, grandNumeratorDerivTmp + kPaddedPatternCount, 0);
-        std::fill(grandDenominatorDerivTmp, grandDenominatorDerivTmp + kPaddedPatternCount, 0);
+void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::resetDerivativeTemporaries(int offset) {
+        std::fill(grandNumeratorDerivTmp + offset, grandNumeratorDerivTmp + offset + kPaddedPatternCount, 0);
+        std::fill(grandDenominatorDerivTmp + offset, grandDenominatorDerivTmp + offset + kPaddedPatternCount, 0);
 }
 
 
 BEAGLE_CPU_TEMPLATE
-void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivatives(double* outDerivatives,
-                                                              double* outSumDerivatives,
-                                                              double* outSumSquaredDerivatives) {
+void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateDerivatives(double *outDerivatives,
+                                                              double *outSumDerivatives,
+                                                              double *outSumSquaredDerivatives,
+                                                              int offset) {
     if (outDerivatives == NULL) {
         accumulateDerivativesDispatch1<false>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     } else {
         accumulateDerivativesDispatch1<true>(
-                outDerivatives, outSumDerivatives, outSumSquaredDerivatives);
+                outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset);
     }
 }
 
 BEAGLE_CPU_TEMPLATE
-void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesStates(const int *tipStates,
-                                                                     const REALTYPE *preOrderPartial,
-                                                                     const int firstDerivativeIndex,
-                                                                     const int secondDerivativeIndex,
-                                                                     const double *categoryRates,
-                                                                     const REALTYPE *categoryWeights,
-                                                                     double *outDerivatives,
-                                                                     double *outSumDerivatives,
-                                                                     double *outSumSquaredDerivatives) {
+void
+BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesStates(const int *tipStates, const REALTYPE *preOrderPartial,
+                                                                const int firstDerivativeIndex,
+                                                                const int secondDerivativeIndex,
+                                                                const double *categoryRates,
+                                                                const REALTYPE *categoryWeights,
+                                                                double *outDerivatives,
+                                                                double *outSumDerivatives,
+                                                                double *outSumSquaredDerivatives,
+                                                                int cacheOffset) {
 
     const REALTYPE *firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
 
@@ -2163,8 +2366,8 @@ void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesStates(const int *
                              preOrderPartial[patternIndex * kPartialsPaddedStateCount + k];
             }
 
-            grandNumeratorDerivTmp[pattern] += categoryWeights[category] * numerator;
-            grandDenominatorDerivTmp[pattern] += categoryWeights[category] * denominator;
+            grandNumeratorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * numerator;
+            grandDenominatorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * denominator;
         }
     }
 }
@@ -2295,10 +2498,10 @@ void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesPartials(const REA
                                                                        const double *categoryRates,
                                                                        const REALTYPE *categoryWeights,
                                                                        const int scalingFactorsIndex,
-//                                                                       const REALTYPE *cumulativeScaleBuffer,
                                                                        double *outDerivatives,
                                                                        double *outSumDerivatives,
-                                                                       double *outSumSquaredDerivatives) {
+                                                                       double *outSumSquaredDerivatives,
+                                                                       int cacheOffset) {
 
     const REALTYPE *firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
 
@@ -2329,8 +2532,8 @@ void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogDerivativesPartials(const REA
                 denominator += postOrderPartial[v + k] * preOrderPartial[v + k];
             }
 
-            grandNumeratorDerivTmp[pattern] += weight * numerator;
-            grandDenominatorDerivTmp[pattern] += weight * denominator;
+            grandNumeratorDerivTmp[cacheOffset + pattern] += weight * numerator;
+            grandDenominatorDerivTmp[cacheOffset + pattern] += weight * denominator;
         }
     }
 }