diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c6afdcaf..837a2b75 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -3,10 +3,10 @@ name: CI on: push: branches: - - master + - "*" pull_request: branches: - - master + - "*" jobs: build: @@ -42,7 +42,7 @@ jobs: uses: uraimo/run-on-arch-action@v2 with: arch: aarch64 - distro: ubuntu20.04 + distro: ubuntu22.04 githubToken: ${{ github.token }} dockerRunArgs: | --volume "${PWD}:/beagle-lib" diff --git a/examples/hmctest/hmctest.cpp b/examples/hmctest/hmctest.cpp index ac3dba6b..056d9e7e 100755 --- a/examples/hmctest/hmctest.cpp +++ b/examples/hmctest/hmctest.cpp @@ -165,8 +165,8 @@ int main( int argc, const char* argv[] ) int nPatterns = strlen(human) * nRepeats; // change # rate category to 2 -// int rateCategoryCount = 4; - int rateCategoryCount = 1; + int rateCategoryCount = 2; +// int rateCategoryCount = 1; int scaleCount = (scaling ? 7 : 0); @@ -207,6 +207,11 @@ int main( int argc, const char* argv[] ) requirementFlags |= BEAGLE_FLAG_VECTOR_NONE; } + int threadCount = 4; + if (threadCount > 1) { + requirementFlags |= BEAGLE_FLAG_THREADING_CPP; + } + // create an instance of the BEAGLE library int instance = beagleCreateInstance( 3, /**< Number of tip data elements (input) */ @@ -228,6 +233,10 @@ int main( int argc, const char* argv[] ) exit(1); } + if (threadCount > 1) { + beagleSetCPUThreadCount(instance, threadCount); + } + int rNumber = instDetails.resourceNumber; fprintf(stdout, "Using resource %i:\n", rNumber); @@ -275,10 +284,10 @@ int main( int argc, const char* argv[] ) //// rates[i] = 3.0 * (i + 1) / (2 * rateCategoryCount + 1); // } -// rates[0] = 0.14251623900062188; -// rates[1] = 1.857483760999378; + rates[0] = 0.14251623900062188; + rates[1] = 1.857483760999378; - rates[0] = 1.0; +// rates[0] = 1.0; beagleSetCategoryRates(instance, &rates[0]); diff --git a/libhmsbeagle/CPU/BeagleCPU4StateImpl.h b/libhmsbeagle/CPU/BeagleCPU4StateImpl.h index 56c4ce8e..3a74f4b1 100644 --- a/libhmsbeagle/CPU/BeagleCPU4StateImpl.h +++ b/libhmsbeagle/CPU/BeagleCPU4StateImpl.h @@ -111,25 +111,27 @@ class BeagleCPU4StateImpl : public BeagleCPUImpl { int endPattern); virtual void calcEdgeLogDerivativesStates(const int *tipStates, - const REALTYPE *preOrderPartial, - const int firstDerivativeIndex, - const int secondDerivativeIndex, - const double *categoryRates, - const REALTYPE *categoryWeights, - double *outDerivatives, - double *outSumDerivatives, - double *outSumSquaredDerivatives); + const REALTYPE *preOrderPartial, + const int firstDerivativeIndex, + const int secondDerivativeIndex, + const double *categoryRates, + const REALTYPE *categoryWeights, + double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int cacheOffset); virtual void calcEdgeLogDerivativesPartials(const REALTYPE *postOrderPartial, - const REALTYPE *preOrderPartial, - const int firstDerivativeIndex, - const int secondDerivativeIndex, - const double *categoryRates, - const REALTYPE *categoryWeights, - const int scalingFactorsIndex, - double *outDerivatives, - double *outSumDerivatives, - double *outSumSquaredDerivatives); + const REALTYPE *preOrderPartial, + const int firstDerivativeIndex, + const int secondDerivativeIndex, + const double *categoryRates, + const REALTYPE *categoryWeights, + const int scalingFactorsIndex, + double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int cacheOffset); virtual void calcCrossProductsStates(const int *tipStates, const REALTYPE *preOrderPartial, diff --git a/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp b/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp index e6988194..b2614595 100644 --- a/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp +++ b/libhmsbeagle/CPU/BeagleCPU4StateImpl.hpp @@ -1008,7 +1008,8 @@ void BeagleCPU4StateImpl::calcEdgeLogDerivativesStates(const const REALTYPE *categoryWeights, double *outDerivatives, double *outSumDerivatives, - double *outSumSquaredDerivatives) { + double *outSumSquaredDerivatives, + int cacheOffset) { for (int category = 0; category < kCategoryCount; category++) { @@ -1031,8 +1032,8 @@ void BeagleCPU4StateImpl::calcEdgeLogDerivativesStates(const REALTYPE denominator = preOrderPartial[localPatternOffset + (state & 3)]; - grandNumeratorDerivTmp[pattern] += categoryWeights[category] * numerator; - grandDenominatorDerivTmp[pattern] += categoryWeights[category] * denominator; + grandNumeratorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * numerator; + grandDenominatorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * denominator; } } } @@ -1047,7 +1048,8 @@ void BeagleCPU4StateImpl::calcEdgeLogDerivativesPartials(con const int scalingFactorsIndex, double *outDerivatives, double *outSumDerivatives, - double *outSumSquaredDerivatives) { + double *outSumSquaredDerivatives, + int cacheOffset) { const REALTYPE* transMatrix = gTransitionMatrices[firstDerivativeIndex]; @@ -1073,8 +1075,8 @@ void BeagleCPU4StateImpl::calcEdgeLogDerivativesPartials(con // grandDenominatorDerivTmp[k] += (postPartials[v] * prePartials[v] + postPartials[v + 1] * prePartials[v + 1] // + postPartials[v + 2] * prePartials[v + 2] + postPartials[v + 3] * prePartials[v + 3]) * weight; - grandDenominatorDerivTmp[k] += (p10 * p00 + p11 * p01 + p12 * p02 + p13 * p03) * weight; - grandNumeratorDerivTmp[k] += (sum10 * p00 + sum11 * p01 + sum12 * p02 + sum13 * p03) * weight; + grandDenominatorDerivTmp[cacheOffset + k] += (p10 * p00 + p11 * p01 + p12 * p02 + p13 * p03) * weight; + grandNumeratorDerivTmp[cacheOffset + k] += (sum10 * p00 + sum11 * p01 + sum12 * p02 + sum13 * p03) * weight; v += 4; } diff --git a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h index d1881311..348ffc72 100644 --- a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h +++ b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h @@ -165,26 +165,27 @@ class BeagleCPU4StateSSEImpl : public BeagleCPU4StateIm protected: virtual int getPaddedPatternsModulus(); - virtual void accumulateDerivatives(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + virtual void accumulateDerivatives(double *outDerivatives, double *outSumDerivatives, + double *outSumSquaredDerivatives, int offset); private: template - void accumulateDerivativesDispatch1(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + void + accumulateDerivativesDispatch1(double *outDerivatives, double *outSumDerivatives, double *outSumSquaredDerivatives, + int offset); template - void accumulateDerivativesDispatch2(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + void accumulateDerivativesDispatch2(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset); template - void accumulateDerivativesImpl(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + void accumulateDerivativesImpl(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset); virtual void calcStatesStates(double* destP, const int* states1, @@ -260,7 +261,8 @@ class BeagleCPU4StateSSEImpl : public BeagleCPU4StateIm const int scalingFactorsIndex, double* outDerivatives, double* outSumDerivatives, - double* outSumSquaredDerivatives); + double* outSumSquaredDerivatives, + int cacheOffset); virtual void calcEdgeLogDerivativesStates(const int* tipStates, const double *__restrict preOrderPartial, @@ -270,7 +272,8 @@ class BeagleCPU4StateSSEImpl : public BeagleCPU4StateIm const double* __restrict categoryWeights, double *outDerivatives, double *outSumDerivatives, - double *outSumSquaredDerivatives); + double *outSumSquaredDerivatives, + int cacheOffset); virtual void calcPartialsPartialsFixedScaling(double* __restrict destP, const double* __restrict child0Partials, diff --git a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp index 52f3b6f8..69dc8ce9 100644 --- a/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp +++ b/libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.hpp @@ -639,10 +639,10 @@ void BeagleCPU4StateSSEImpl::calcCrossProductsStates(co } BEAGLE_CPU_4_SSE_TEMPLATE template -void BeagleCPU4StateSSEImpl::accumulateDerivativesImpl( - double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void BeagleCPU4StateSSEImpl::accumulateDerivativesImpl(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { V_Real vSum = VEC_SETZERO(); V_Real vSumSquared = VEC_SETZERO(); @@ -650,8 +650,8 @@ void BeagleCPU4StateSSEImpl::accumulateDerivativesImpl( int k = 0; for (; k < kPatternCount - 1; k += 2) { - V_Real numerator = VEC_LOAD(grandNumeratorDerivTmp + k); - V_Real denominator = VEC_LOAD(grandDenominatorDerivTmp + k); + V_Real numerator = VEC_LOAD(grandNumeratorDerivTmp + k + offset); + V_Real denominator = VEC_LOAD(grandDenominatorDerivTmp + k + offset); V_Real derivative = VEC_DIV(numerator, denominator); V_Real patternWeight = VEC_LOAD(gPatternWeights + k); @@ -681,7 +681,7 @@ void BeagleCPU4StateSSEImpl::accumulateDerivativesImpl( } for (; k < kPatternCount; ++k) { - double derivative = grandNumeratorDerivTmp[k] / grandDenominatorDerivTmp[k]; + double derivative = grandNumeratorDerivTmp[k + offset] / grandDenominatorDerivTmp[k + offset]; if (DoDerivatives) { outDerivatives[k] = derivative; } @@ -703,46 +703,47 @@ void BeagleCPU4StateSSEImpl::accumulateDerivativesImpl( } BEAGLE_CPU_4_SSE_TEMPLATE template -void BeagleCPU4StateSSEImpl::accumulateDerivativesDispatch2( - double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void BeagleCPU4StateSSEImpl::accumulateDerivativesDispatch2(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { if (outSumSquaredDerivatives == NULL) { accumulateDerivativesImpl( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } else { accumulateDerivativesImpl( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } } BEAGLE_CPU_4_SSE_TEMPLATE template -void BeagleCPU4StateSSEImpl::accumulateDerivativesDispatch1( - double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void BeagleCPU4StateSSEImpl::accumulateDerivativesDispatch1(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { if (outSumDerivatives == NULL) { accumulateDerivativesDispatch2( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } else { accumulateDerivativesDispatch2( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } } BEAGLE_CPU_4_SSE_TEMPLATE -void BeagleCPU4StateSSEImpl::accumulateDerivatives(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void BeagleCPU4StateSSEImpl::accumulateDerivatives(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { if (outDerivatives == NULL) { accumulateDerivativesDispatch1( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } else { accumulateDerivativesDispatch1( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } } @@ -756,7 +757,8 @@ void BeagleCPU4StateSSEImpl::calcEdgeLogDerivativesPart const int scalingFactorsIndex, double* outDerivatives, double* outSumDerivatives, - double* outSumSquaredDerivatives) { + double* outSumSquaredDerivatives, + int cacheOffset) { double* cl_p = integrationTmp; memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double)); @@ -809,8 +811,8 @@ void BeagleCPU4StateSSEImpl::calcEdgeLogDerivativesPart double numer = _mm_cvtsd_f64(vnumer) * wt[l]; double denon = _mm_cvtsd_f64(vdenom) * wt[l]; - grandNumeratorDerivTmp[k] += numer; // TODO Merge [numer, denom] into single SSE transactions - grandDenominatorDerivTmp[k] += denon; + grandNumeratorDerivTmp[cacheOffset + k] += numer; // TODO Merge [numer, denom] into single SSE transactions + grandDenominatorDerivTmp[cacheOffset + k] += denon; v += 4; } @@ -831,7 +833,8 @@ void BeagleCPU4StateSSEImpl::calcEdgeLogDerivativesStat const double* categoryWeights, double *outDerivatives, double *outSumDerivatives, - double *outSumSquaredDerivatives) { + double *outSumSquaredDerivatives, + int cacheOffset) { double* cl_p = integrationTmp; memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double)); @@ -866,8 +869,8 @@ void BeagleCPU4StateSSEImpl::calcEdgeLogDerivativesStat double numer = _mm_cvtsd_f64(vnumer); double denom = cl_r[stateChild & 3]; cl_r += 4; - grandNumeratorDerivTmp[k] += numer * wt[l]; - grandDenominatorDerivTmp[k] += denom * wt[l]; + grandNumeratorDerivTmp[cacheOffset + k] += numer * wt[l]; + grandDenominatorDerivTmp[cacheOffset + k] += denom * wt[l]; } w += OFFSET*4; vcl_r += 2 * kExtraPatterns; diff --git a/libhmsbeagle/CPU/BeagleCPUImpl.h b/libhmsbeagle/CPU/BeagleCPUImpl.h index 04d82974..b9751cbb 100644 --- a/libhmsbeagle/CPU/BeagleCPUImpl.h +++ b/libhmsbeagle/CPU/BeagleCPUImpl.h @@ -45,8 +45,8 @@ // TODO: assess following cut-offs dynamically #define BEAGLE_CPU_ASYNC_HW_THREAD_COUNT_THRESHOLD 16 // CPU category threshold -#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_LOW 256 // do not use CPU auto-threading for problems with fewer patterns on CPUs with many cores -#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_HIGH 768 // do not use CPU auto-threading for problems with fewer patterns on CPUs with few cores +#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_LOW 256 // do not use CPU auto-threading for problems with fewer patterns on CPUs with many cores +#define BEAGLE_CPU_ASYNC_MIN_PATTERN_COUNT_HIGH 768 // do not use CPU auto-threading for problems with fewer patterns on CPUs with few cores #define BEAGLE_CPU_ASYNC_LIMIT_PATTERN_COUNT 262144 // do not use all CPU cores for problems with fewer patterns namespace beagle { @@ -462,6 +462,13 @@ class BeagleCPUImpl : public BeagleImpl { int count, int cumulativeScaleIndex); + + virtual int calcEdgeLogDerivativesByAutoPartitionAsync(const int *operations, + int count, + double *siteLogLikelihoods, + double *outLogFirstDerivatives, + double *outLogDiagonalSecondDerivatives); + virtual int calcEdgeLogDerivatives(const int *postBufferIndices, const int *preBufferIndices, const int *firstDerivativeIndices, @@ -474,16 +481,15 @@ class BeagleCPUImpl : public BeagleImpl { double *outLogFirstDerivatives, double *outLogDiagonalSecondDerivatives); - virtual void calcEdgeLogDerivativesStates(const int *tipStates, - const REALTYPE *preOrderPartial, + virtual void calcEdgeLogDerivativesStates(const int *tipStates, const REALTYPE *preOrderPartial, const int firstDerivativeIndex, const int secondDerivativeIndex, const double *categoryRates, const REALTYPE *categoryWeights, -// const REALTYPE *cumulativeScaleBuffer, double *siteLogLikelihoods, double *outLogFirstDerivatives, - double *outLogDiagonalSecondDerivatives); + double *outLogDiagonalSecondDerivatives, + int cacheOffset); virtual void calcEdgeLogDerivativesPartials(const REALTYPE *postOrderPartial, const REALTYPE *preOrderPartial, @@ -492,10 +498,10 @@ class BeagleCPUImpl : public BeagleImpl { const double *categoryRates, const REALTYPE *categoryWeights, const int scalingFactorsIndex, -// const REALTYPE *cumulativeScaleBuffer, double *siteLogLikelihoods, double *outLogFirstDerivatives, - double *outLogDiagonalSecondDerivatives); + double *outLogDiagonalSecondDerivatives, + int cacheOffset); virtual int calcCrossProducts(const int *postBufferIndices, const int *preBufferIndices, @@ -522,11 +528,10 @@ class BeagleCPUImpl : public BeagleImpl { double *outCrossProducts, double *outSumSquaredDerivatives); - virtual void resetDerivativeTemporaries(); + virtual void resetDerivativeTemporaries(int offset); - virtual void accumulateDerivatives(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + virtual void accumulateDerivatives(double *outDerivatives, double *outSumDerivatives, + double *outSumSquaredDerivatives, int offset); virtual void autoPartitionPartialsOperations(const int* operations, int* partitionOperations, @@ -772,19 +777,17 @@ class BeagleCPUImpl : public BeagleImpl { private: template - void accumulateDerivativesDispatch1(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + void + accumulateDerivativesDispatch1(double *outDerivatives, double *outSumDerivatives, double *outSumSquaredDerivatives, + int offset); template - void accumulateDerivativesDispatch2(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + void accumulateDerivativesDispatch2(double *outDerivatives, double *outSumDerivatives, + double *outSumSquaredDerivatives, int offset); template - void accumulateDerivativesImpl(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives); + void accumulateDerivativesImpl(double *outDerivatives, double *outSumDerivatives, + double *outSumSquaredDerivatives, int offset); }; BEAGLE_CPU_FACTORY_TEMPLATE diff --git a/libhmsbeagle/CPU/BeagleCPUImpl.hpp b/libhmsbeagle/CPU/BeagleCPUImpl.hpp index 8cf0349d..0e48881a 100644 --- a/libhmsbeagle/CPU/BeagleCPUImpl.hpp +++ b/libhmsbeagle/CPU/BeagleCPUImpl.hpp @@ -809,6 +809,11 @@ int BeagleCPUImpl::setPatternPartitions(int partitionCount, assert(inPatternPartitions != 0L); kPartitionCount = partitionCount; + grandDenominatorDerivTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * (kPaddedPatternCount + 1) * kPartitionCount); + grandNumeratorDerivTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * (kPaddedPatternCount + 1)* kPartitionCount); + firstDerivTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPaddedPatternCount * kStateCount * kPartitionCount); + secondDerivTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPaddedPatternCount * kStateCount * kPartitionCount); + if (!kPartitionsInitialised) { gPatternPartitions = (int*) malloc(sizeof(int) * kPatternCount); @@ -1329,10 +1334,13 @@ int BeagleCPUImpl::updatePrePartials(const int *operations, cumulativeScaleIndex); count *= kPartitionCount; returnCode = upPrePartialsByPartitionAsync((const int*) gAutoPartitionOperations, - count); + count); } else { bool byPartition = false; - returnCode = upPrePartials(byPartition, operations, count, cumulativeScaleIndex); + returnCode = upPrePartials(byPartition, + operations, + count, + cumulativeScaleIndex); } return returnCode; @@ -1373,13 +1381,111 @@ int BeagleCPUImpl::calculateEdgeDerivatives(const int *postB double *outDerivatives, double *outSumDerivatives, double *outSumSquaredDerivatives) { - return calcEdgeLogDerivatives( - postBufferIndices, preBufferIndices, - derivativeMatrixIndices, NULL, - categoryWeightsIndices,categoryRatesIndices, cumulativeScaleIndices, - count, - outDerivatives, - outSumDerivatives, outSumSquaredDerivatives); + int returnCode = BEAGLE_ERROR_GENERAL; + + if (kAutoPartitioningEnabled && kPartitionCount > 1) { + + + + int numOps = BEAGLE_OP_COUNT; + int numOpsP = BEAGLE_PARTITION_OP_COUNT; + + int perThreadCount = (count - (count % kPartitionCount)) / kPartitionCount; + +#ifdef BEAGLE_DEBUG_FLOW + std::cerr<<"Mark 0" << ", Original count = "<< count << ", perThreadCount = " << perThreadCount < threadTask( + std::bind(&BeagleCPUImpl::calcEdgeLogDerivativesByAutoPartitionAsync, this, + (const int*) gThreadOperations[i], + gThreadOpCounts[i], + outDerivatives, + outSumDerivatives, + outSumSquaredDerivatives)); + + gFutures[i] = threadTask.get_future(); + threadData* td = &gThreads[i]; + + std::unique_lock l(td->m); + td->jobs.push(std::move(threadTask)); + l.unlock(); + + gThreads[i].cv.notify_one(); + } + + for (int i=0; i::updatePrePartialsByPartition(const int* o int returnCode = BEAGLE_ERROR_GENERAL; if (kThreadingEnabled) { -// TODO -// returnCode = upPrePartialsByPartitionAsync(operations, -// count); + returnCode = upPrePartialsByPartitionAsync(operations, + count); } else { bool byPartition = true; returnCode = upPrePartials(byPartition, @@ -1987,6 +2092,100 @@ int BeagleCPUImpl::calcCrossProducts(const int *postBufferIn return returnCode; } +BEAGLE_CPU_TEMPLATE +int BeagleCPUImpl::calcEdgeLogDerivativesByAutoPartitionAsync(const int *operations, + int count, + double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives) { + + int returnCode = BEAGLE_SUCCESS; + + int numOps = BEAGLE_OP_COUNT; + int numOpsP = BEAGLE_PARTITION_OP_COUNT; + + const int secondDerivativeIndex = BEAGLE_OP_NONE; + const double *categoryRates = NULL; // gCategoryRates[categoryRatesIndices[0]]; // TODO Generalize + const REALTYPE *categoryWeights = gCategoryWeights[operations[3]]; // TODO Generalize + + + + for (int nodeNum = 0; nodeNum < count; nodeNum++) { + + const REALTYPE *preOrderPartial = gPartials[operations[nodeNum * numOpsP + 1]]; + const int *tipStates = gTipStates[operations[nodeNum * numOpsP]]; + + const int firstDerivativeIndex = operations[nodeNum * numOpsP + 2]; + const int scalingFactorsIndex = -1; // cumulativeScaleIndices[nodeNum]; + + const int patternOffset = operations[nodeNum * numOpsP + 4] * kPatternCount; + const int threadOffset = operations[nodeNum * numOpsP + 5] * ((kPaddedPatternCount + 1) / 2 * 2); + + +#ifdef BEAGLE_DEBUG_FLOW + std::cerr<<"Job = " << operations[nodeNum * numOpsP + 4] <::calcEdgeLogDerivatives(const int *postBufferIndices, const int *preBufferIndices, @@ -2022,30 +2221,30 @@ int BeagleCPUImpl::calcEdgeLogDerivatives(const int *postBuf double* outSumSquaredDerivativesForNode = (outSumSquaredDerivatives == NULL) ? NULL : outSumSquaredDerivatives + nodeNum; - resetDerivativeTemporaries(); + resetDerivativeTemporaries(0); if (tipStates != NULL) { calcEdgeLogDerivativesStates(tipStates, preOrderPartial, firstDerivativeIndex, - secondDerivativeIndex, categoryRates, categoryWeights, - outDerivativesForNode, - outSumDerivativesForNode, - outSumSquaredDerivativesForNode); + secondDerivativeIndex, categoryRates, categoryWeights, + outDerivativesForNode, + outSumDerivativesForNode, + outSumSquaredDerivativesForNode, 0); } else { const REALTYPE *postOrderPartial = gPartials[postBufferIndices[nodeNum]]; calcEdgeLogDerivativesPartials(postOrderPartial, preOrderPartial, firstDerivativeIndex, - secondDerivativeIndex, categoryRates, categoryWeights, - scalingFactorsIndex, - outDerivativesForNode, - outSumDerivativesForNode, - outSumSquaredDerivativesForNode); + secondDerivativeIndex, categoryRates, categoryWeights, + scalingFactorsIndex, + outDerivativesForNode, + outSumDerivativesForNode, + outSumSquaredDerivativesForNode, 0); } accumulateDerivatives(outDerivativesForNode, - outSumDerivativesForNode, - outSumSquaredDerivativesForNode); + outSumDerivativesForNode, + outSumSquaredDerivativesForNode, 0); } @@ -2053,16 +2252,16 @@ int BeagleCPUImpl::calcEdgeLogDerivatives(const int *postBuf } BEAGLE_CPU_TEMPLATE template -void BeagleCPUImpl::accumulateDerivativesImpl( - double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void BeagleCPUImpl::accumulateDerivativesImpl(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { REALTYPE sum = 0.0; REALTYPE sumSquared = 0.0; for (int k = 0; k < kPatternCount; k++) { - REALTYPE derivative = grandNumeratorDerivTmp[k] / grandDenominatorDerivTmp[k]; + REALTYPE derivative = grandNumeratorDerivTmp[offset + k] / grandDenominatorDerivTmp[offset + k]; if (DoDerivatives) { outDerivatives[k] = derivative; } @@ -2084,65 +2283,69 @@ void BeagleCPUImpl::accumulateDerivativesImpl( } BEAGLE_CPU_TEMPLATE template -void BeagleCPUImpl::accumulateDerivativesDispatch2( - double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void +BeagleCPUImpl::accumulateDerivativesDispatch2(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { if (outSumSquaredDerivatives == NULL) { accumulateDerivativesImpl( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } else { accumulateDerivativesImpl( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } } BEAGLE_CPU_TEMPLATE template -void BeagleCPUImpl::accumulateDerivativesDispatch1( - double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void +BeagleCPUImpl::accumulateDerivativesDispatch1(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { if (outSumDerivatives == NULL) { accumulateDerivativesDispatch2( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } else { accumulateDerivativesDispatch2( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } } BEAGLE_CPU_TEMPLATE -void BeagleCPUImpl::resetDerivativeTemporaries() { - std::fill(grandNumeratorDerivTmp, grandNumeratorDerivTmp + kPaddedPatternCount, 0); - std::fill(grandDenominatorDerivTmp, grandDenominatorDerivTmp + kPaddedPatternCount, 0); +void BeagleCPUImpl::resetDerivativeTemporaries(int offset) { + std::fill(grandNumeratorDerivTmp + offset, grandNumeratorDerivTmp + offset + kPaddedPatternCount, 0); + std::fill(grandDenominatorDerivTmp + offset, grandDenominatorDerivTmp + offset + kPaddedPatternCount, 0); } BEAGLE_CPU_TEMPLATE -void BeagleCPUImpl::accumulateDerivatives(double* outDerivatives, - double* outSumDerivatives, - double* outSumSquaredDerivatives) { +void BeagleCPUImpl::accumulateDerivatives(double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int offset) { if (outDerivatives == NULL) { accumulateDerivativesDispatch1( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } else { accumulateDerivativesDispatch1( - outDerivatives, outSumDerivatives, outSumSquaredDerivatives); + outDerivatives, outSumDerivatives, outSumSquaredDerivatives, offset); } } BEAGLE_CPU_TEMPLATE -void BeagleCPUImpl::calcEdgeLogDerivativesStates(const int *tipStates, - const REALTYPE *preOrderPartial, - const int firstDerivativeIndex, - const int secondDerivativeIndex, - const double *categoryRates, - const REALTYPE *categoryWeights, - double *outDerivatives, - double *outSumDerivatives, - double *outSumSquaredDerivatives) { +void +BeagleCPUImpl::calcEdgeLogDerivativesStates(const int *tipStates, const REALTYPE *preOrderPartial, + const int firstDerivativeIndex, + const int secondDerivativeIndex, + const double *categoryRates, + const REALTYPE *categoryWeights, + double *outDerivatives, + double *outSumDerivatives, + double *outSumSquaredDerivatives, + int cacheOffset) { const REALTYPE *firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex]; @@ -2163,8 +2366,8 @@ void BeagleCPUImpl::calcEdgeLogDerivativesStates(const int * preOrderPartial[patternIndex * kPartialsPaddedStateCount + k]; } - grandNumeratorDerivTmp[pattern] += categoryWeights[category] * numerator; - grandDenominatorDerivTmp[pattern] += categoryWeights[category] * denominator; + grandNumeratorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * numerator; + grandDenominatorDerivTmp[cacheOffset + pattern] += categoryWeights[category] * denominator; } } } @@ -2295,10 +2498,10 @@ void BeagleCPUImpl::calcEdgeLogDerivativesPartials(const REA const double *categoryRates, const REALTYPE *categoryWeights, const int scalingFactorsIndex, -// const REALTYPE *cumulativeScaleBuffer, double *outDerivatives, double *outSumDerivatives, - double *outSumSquaredDerivatives) { + double *outSumSquaredDerivatives, + int cacheOffset) { const REALTYPE *firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex]; @@ -2329,8 +2532,8 @@ void BeagleCPUImpl::calcEdgeLogDerivativesPartials(const REA denominator += postOrderPartial[v + k] * preOrderPartial[v + k]; } - grandNumeratorDerivTmp[pattern] += weight * numerator; - grandDenominatorDerivTmp[pattern] += weight * denominator; + grandNumeratorDerivTmp[cacheOffset + pattern] += weight * numerator; + grandDenominatorDerivTmp[cacheOffset + pattern] += weight * denominator; } } }